Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPU.h | 33
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPU.td | 166
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 14
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 8
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 100
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 425
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 33
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 8
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 6
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 301
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 22
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h | 9
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 324
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1078
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 85
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 5
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 18
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 77
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 125
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td | 5
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 137
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 15
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 44
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 47
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 2
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h | 42
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 31
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h | 203
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 189
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 254
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 169
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 23
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 13
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 3
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 5
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 12
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 149
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 49
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1626
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td | 1350
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/CIInstructions.td | 336
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td | 139
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/DSInstructions.td | 906
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 238
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 173
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 143
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td | 530
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 258
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 9
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 312
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 54
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 485
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 200
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 137
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 26
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 13
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 13
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 11
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp | 408
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h | 26
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 306
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 37
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 18
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 129
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td | 763
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Processors.td | 70
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 38
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 81
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 3
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h | 12
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 688
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 66
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h | 27
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600Instructions.td | 83
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h | 7
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h | 20
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 75
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 6
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 11
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 2
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIDefines.h | 196
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 160
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 668
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 455
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h | 30
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1954
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h | 44
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 329
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | 130
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td | 716
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1260
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h | 214
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3192
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstructions.td | 3193
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td | 29
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 300
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 930
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 32
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 57
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 142
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 80
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 58
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 304
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 1233
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 152
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 178
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SISchedule.td | 6
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 208
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp | 4
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 424
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SMInstructions.td | 535
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td | 1232
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 17
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 282
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 151
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 131
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 29
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td | 277
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VIInstructions.td | 150
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td | 615
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td | 757
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td | 451
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td | 1144
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td | 350
129 files changed, 23805 insertions, 13078 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index d4784b5..7b0a7f4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -11,22 +11,18 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
-class AMDGPUInstrPrinter;
-class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
-struct MachineSchedContext;
-class MCAsmInfo;
-class raw_ostream;
-class ScheduleDAGInstrs;
+class ModulePass;
+class Pass;
class Target;
class TargetMachine;
+class PassRegistry;
// R600 Passes
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
@@ -45,16 +41,12 @@ FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSIWholeQuadModePass();
-FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
-FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
-ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
-
ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -78,21 +70,30 @@ void initializeSIWholeQuadModePass(PassRegistry &);
extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
-extern char &SILowerControlFlowPassID;
+extern char &SILowerControlFlowID;
+
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;
+void initializeSIOptimizeExecMaskingPass(PassRegistry &);
+extern char &SIOptimizeExecMaskingID;
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
-FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
-FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
ModulePass *createAMDGPUAlwaysInlinePass();
ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
+FunctionPass* createAMDGPUUnifyMetadataPass();
+void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
+extern char &AMDGPUUnifyMetadataID;
+
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
@@ -111,8 +112,8 @@ extern char &SIDebuggerInsertNopsID;
void initializeSIInsertWaitsPass(PassRegistry&);
extern char &SIInsertWaitsID;
-extern Target TheAMDGPUTarget;
-extern Target TheGCNTarget;
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
namespace AMDGPU {
enum TargetIndex {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index 72c4553..1302200 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -67,6 +67,19 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"Support unaligned global loads and stores"
>;
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+ "UnalignedScratchAccess",
+ "true",
+ "Support unaligned scratch loads and stores"
+>;
+
+// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
+// XNACK. The current default kernel driver setting is:
+// - graphics ring: XNACK disabled
+// - compute ring: XNACK enabled
+//
+// If XNACK is enabled, the VMEM latency can be worse.
+// If XNACK is disabled, the 2 SGPRs can be used for general purposes.
def FeatureXNACK : SubtargetFeature<"xnack",
"EnableXNACK",
"true",
@@ -110,20 +123,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping>
- : SubtargetFeature <
- "isaver"#Major#"."#Minor#"."#Stepping,
- "IsaVersion",
- "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
- "Instruction set version number"
->;
-
-def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>;
-def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>;
-def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>;
-def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
-def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
-
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
"LocalMemorySize",
@@ -161,16 +160,46 @@ def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
"Has s_memrealtime instruction"
>;
+def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm",
+ "HasInv2PiInlineImm",
+ "true",
+ "Has 1 / (2 * pi) as inline immediate"
+>;
+
def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
"Has16BitInsts",
"true",
"Has i16/f16 instructions"
>;
+def FeatureMovrel : SubtargetFeature<"movrel",
+ "HasMovrel",
+ "true",
+ "Has v_movrel*_b32 instructions"
+>;
+
+def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
+ "HasVGPRIndexMode",
+ "true",
+ "Has VGPR mode register indexing"
+>;
+
+def FeatureScalarStores : SubtargetFeature<"scalar-stores",
+ "HasScalarStores",
+ "true",
+ "Has store scalar memory instructions"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
+def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
+ "FP16Denormals",
+ "true",
+ "Enable half precision denormal handling"
+>;
+
// Some instructions do not support denormals despite this flag. Using
// fp32 denormals also causes instructions to run at the double
// precision rate for the device.
@@ -253,6 +282,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
"Enable SI Machine Scheduler"
>;
+// Unless +-flat-for-global is specified, turn on FlatForGlobal for
+// all OS-es on VI and newer hardware to avoid assertion failures due
+// to missing ADDR64 variants of MUBUF instructions.
+// FIXME: moveToVALU should be able to handle converting addr64 MUBUF
+// instructions.
+
def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
"FlatForGlobal",
"true",
@@ -294,23 +329,76 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768,
FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
- FeatureLDSBankCount32]
+ FeatureLDSBankCount32, FeatureMovrel]
>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureGCN1Encoding, FeatureCIInsts]
+ FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel]
>;
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
- FeatureSMemRealTime
+ FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
+ FeatureScalarStores, FeatureInv2PiInlineImm
]
>;
+class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
+ list<SubtargetFeature> Implies>
+ : SubtargetFeature <
+ "isaver"#Major#"."#Minor#"."#Stepping,
+ "IsaVersion",
+ "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
+ "Instruction set version number",
+ Implies
+>;
+
+def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
+ [FeatureSeaIslands,
+ HalfRate64Ops,
+ FeatureLDSBankCount32,
+ FeatureFastFMAF32]>;
+
+def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount16]>;
+
+def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureXNACK]>;
+
+def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount16,
+ FeatureXNACK]>;
+
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
@@ -349,10 +437,52 @@ def AMDGPUAsmParser : AsmParser {
let ShouldEmitMatchRegisterName = 0;
}
+def AMDGPUAsmWriter : AsmWriter {
+ int PassSubtarget = 1;
+}
+
+def AMDGPUAsmVariants {
+ string Default = "Default";
+ int Default_ID = 0;
+ string VOP3 = "VOP3";
+ int VOP3_ID = 1;
+ string SDWA = "SDWA";
+ int SDWA_ID = 2;
+ string DPP = "DPP";
+ int DPP_ID = 3;
+ string Disable = "Disable";
+ int Disable_ID = 4;
+}
+
+def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.Default_ID;
+ let Name = AMDGPUAsmVariants.Default;
+}
+
+def VOP3AsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.VOP3_ID;
+ let Name = AMDGPUAsmVariants.VOP3;
+}
+
+def SDWAAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.SDWA_ID;
+ let Name = AMDGPUAsmVariants.SDWA;
+}
+
+def DPPAsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.DPP_ID;
+ let Name = AMDGPUAsmVariants.DPP;
+}
+
def AMDGPU : Target {
// Pull in Instruction Info:
let InstructionSet = AMDGPUInstrInfo;
let AssemblyParsers = [AMDGPUAsmParser];
+ let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
+ VOP3AsmParserVariant,
+ SDWAAsmParserVariant,
+ DPPAsmParserVariant];
+ let AssemblyWriters = [AMDGPUAsmWriter];
}
// Dummy Instruction itineraries for pseudo instructions
@@ -381,6 +511,8 @@ def isCIVI : Predicate <
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 63f5fb3..067a16a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -27,7 +27,7 @@ class AMDGPUAlwaysInline : public ModulePass {
public:
AMDGPUAlwaysInline() : ModulePass(ID) { }
bool runOnModule(Module &M) override;
- const char *getPassName() const override { return "AMDGPU Always Inline Pass"; }
+ StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
};
} // End anonymous namespace
@@ -35,8 +35,20 @@ public:
char AMDGPUAlwaysInline::ID = 0;
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ std::vector<GlobalAlias*> AliasesToRemove;
std::vector<Function *> FuncsToClone;
+ for (GlobalAlias &A : M.aliases()) {
+ if (Function* F = dyn_cast<Function>(A.getAliasee())) {
+ A.replaceAllUsesWith(F);
+ AliasesToRemove.push_back(&A);
+ }
+ }
+
+ for (GlobalAlias* A : AliasesToRemove) {
+ A->eraseFromParent();
+ }
+
for (Function &F : M) {
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
!F.hasFnAttribute(Attribute::NoInline))
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 0910b28..c98d25e2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -35,7 +36,7 @@ public:
AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
bool runOnModule(Module &M) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
}
@@ -188,7 +189,8 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
static const StringRef HSAIntrinsicToAttr[][2] = {
{ "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
- { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }
+ { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
+ { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
};
// TODO: We should not add the attributes if the known compile time workgroup
@@ -200,7 +202,7 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
// always initialized.
bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA) {
+ if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
for (Function &F : M) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 2010cc9..c011be6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -15,7 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
@@ -30,6 +33,10 @@ namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
DivergenceAnalysis *DA;
+ MemoryDependenceResults *MDR;
+ LoopInfo *LI;
+ DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+ bool isKernelFunc;
public:
static char ID;
@@ -37,15 +44,19 @@ public:
FunctionPass(ID) { }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
+ StringRef getPassName() const override {
+ return "AMDGPU Annotate Uniform Values";
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}
void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
-
+ bool isClobberedInFunction(LoadInst * Load);
};
} // End anonymous namespace
@@ -53,6 +64,8 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -61,6 +74,46 @@ char AMDGPUAnnotateUniformValues::ID = 0;
static void setUniformMetadata(Instruction *I) {
I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
}
+static void setNoClobberMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+ for (auto I : predecessors(Root))
+ if (Set.insert(I))
+ DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+ // 1. get Loop for the Load->getparent();
+ // 2. if it exists, collect all the BBs from the most outer
+ // loop and check for the writes. If NOT - start DFS over all preds.
+ // 3. Start DFS over all preds from the most outer loop header.
+ SetVector<BasicBlock *> Checklist;
+ BasicBlock *Start = Load->getParent();
+ Checklist.insert(Start);
+ const Value *Ptr = Load->getPointerOperand();
+ const Loop *L = LI->getLoopFor(Start);
+ if (L) {
+ const Loop *P = L;
+ do {
+ L = P;
+ P = P->getParentLoop();
+ } while (P);
+ Checklist.insert(L->block_begin(), L->block_end());
+ Start = L->getHeader();
+ }
+
+ DFS(Start, Checklist);
+ for (auto &BB : Checklist) {
+ BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+ BasicBlock::iterator(Load) : BB->end();
+ if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
+ true, StartIt, BB, Load).isClobber())
+ return true;
+ }
+ return false;
+}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
if (I.isUnconditional())
@@ -77,10 +130,39 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
-
- if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+ auto isGlobalLoad = [](LoadInst &Load)->bool {
+ return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ };
+ // We're tracking up to the Function boundaries
+ // We cannot go beyond because of FunctionPass restrictions
+ // Thus we can ensure that memory not clobbered for memory
+ // operations that live in kernel only.
+ bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+ if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+ if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+ // Lookup for the existing GEP
+ if (noClobberClones.count(Ptr)) {
+ PtrI = noClobberClones[Ptr];
+ } else {
+ // Create GEP of the Value
+ Function *F = I.getParent()->getParent();
+ Value *Idx = Constant::getIntegerValue(
+ Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+ // Insert GEP at the entry to make it dominate all uses
+ PtrI = GetElementPtrInst::Create(
+ Ptr->getType()->getPointerElementType(), Ptr,
+ ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+ }
+ I.replaceUsesOfWith(Ptr, PtrI);
+ }
+ }
+
+ if (PtrI) {
setUniformMetadata(PtrI);
-
+ if (NotClobbered)
+ setNoClobberMetadata(PtrI);
+ }
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -91,9 +173,13 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- DA = &getAnalysis<DivergenceAnalysis>();
- visit(F);
+ DA = &getAnalysis<DivergenceAnalysis>();
+ MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ visit(F);
+ noClobberClones.clear();
return true;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c9c95c7..974e79f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -39,9 +39,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "AMDGPURuntimeMetadata.h"
-using namespace ::AMDGPU;
using namespace llvm;
// TODO: This should get the default rounding mode from the kernel. We just set
@@ -87,13 +85,19 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
}
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
- TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
- TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
+ createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
+ createAMDGPUAsmPrinterPass);
}
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+StringRef AMDGPUAsmPrinter::getPassName() const {
+ return "AMDGPU Assembly Printer";
+}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
@@ -113,13 +117,30 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
"AMD", "AMDGPU");
- emitStartOfRuntimeMetadata(M);
+
+ // Emit runtime metadata.
+ TS->EmitRuntimeMetadata(M);
}
+bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const {
+ if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+ return false;
+
+ if (MBB->empty())
+ return true;
+
+ // If this is a block implementing a long branch, an expression relative to
+ // the start of the block is needed. to the start of the block.
+ // XXX - Is there a smarter way to check this?
+ return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
+}
+
+
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
- if (STM.isAmdHsaOS()) {
+ if (STM.isAmdCodeObjectV2(*MF)) {
getSIProgramInfo(KernelInfo, *MF);
EmitAmdKernelCodeT(*MF, KernelInfo);
}
@@ -128,11 +149,12 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isKernel() && STM.isAmdHsaOS()) {
+ if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
- ELF::STT_AMDGPU_HSA_KERNEL);
+ SmallString<128> SymbolName;
+ getNameWithPrefix(SymbolName, MF->getFunction()),
+ TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
AsmPrinter::EmitFunctionEntryLabel();
@@ -154,12 +176,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
MCContext &Context = getObjFileLowering().getContext();
- MCSectionELF *ConfigSection =
- Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
- OutStreamer->SwitchSection(ConfigSection);
+ if (!STM.isAmdHsaOS()) {
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+ }
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
getSIProgramInfo(KernelInfo, MF);
@@ -198,6 +222,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
" bytes/workgroup (compile time only)", false);
+ OutStreamer->emitRawComment(" SGPRBlocks: " +
+ Twine(KernelInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(" VGPRBlocks: " +
+ Twine(KernelInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
+ Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
+ Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
+
OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
false);
OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
@@ -229,7 +263,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
- Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
}
}
@@ -247,8 +281,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
- emitRuntimeMetadata(*MF.getFunction());
-
return false;
}
@@ -282,7 +314,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
// Evergreen / Northern Islands
switch (MF.getFunction()->getCallingConv()) {
- default: // Fall through
+ default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
@@ -291,9 +323,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
} else {
// R600 / R700
switch (MF.getFunction()->getCallingConv()) {
- default: // Fall through
- case CallingConv::AMDGPU_GS: // Fall through
- case CallingConv::AMDGPU_CS: // Fall through
+ default: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
}
@@ -301,13 +333,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
- S_STACK_SIZE(MFI->StackSize), 4);
+ S_STACK_SIZE(MFI->CFStackSize), 4);
OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
}
}
@@ -331,7 +363,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (MI.isDebugValue())
continue;
- CodeSize += TII->getInstSizeInBytes(MI);
+ if (isVerbose())
+ CodeSize += TII->getInstSizeInBytes(MI);
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
@@ -360,7 +393,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
- FlatUsed = true;
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer.
+ if (MFI->hasFlatScratchInit())
+ FlatUsed = true;
continue;
case AMDGPU::TBA:
@@ -369,26 +405,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
- llvm_unreachable("Trap Handler registers should not be used");
- continue;
+ llvm_unreachable("trap handler registers should not be used");
default:
break;
}
if (AMDGPU::SReg_32RegClass.contains(reg)) {
- if (AMDGPU::TTMP_32RegClass.contains(reg)) {
- llvm_unreachable("Trap Handler registers should not be used");
- }
+ assert(!AMDGPU::TTMP_32RegClass.contains(reg) &&
+ "trap handler registers should not be used");
isSGPR = true;
width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
isSGPR = false;
width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(reg)) {
- if (AMDGPU::TTMP_64RegClass.contains(reg)) {
- llvm_unreachable("Trap Handler registers should not be used");
- }
+ assert(!AMDGPU::TTMP_64RegClass.contains(reg) &&
+ "trap handler registers should not be used");
isSGPR = true;
width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(reg)) {
@@ -445,20 +478,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ExtraSGPRs = 6;
}
- MaxSGPR += ExtraSGPRs;
-
// Record first reserved register and reserved register count fields, and
// update max register counts if "amdgpu-debugger-reserve-regs" attribute was
- // specified.
- if (STM.debuggerReserveRegs()) {
- ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
- ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
- MaxVGPR += MFI->getDebuggerReservedVGPRCount();
- }
+ // requested.
+ ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
+ ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
- // attribute was specified.
+ // attribute was requested.
if (STM.debuggerEmitPrologue()) {
ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
@@ -466,21 +494,59 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
RI->getHWRegIndex(MFI->getScratchRSrcReg());
}
+ // Check the addressable register limit before we add ExtraSGPRs.
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ !STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
+ if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm.
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ "addressable scalar registers",
+ MaxSGPR + 1, DS_Error,
+ DK_ResourceLimit, MaxAddressableNumSGPRs);
+ Ctx.diagnose(Diag);
+ MaxSGPR = MaxAddressableNumSGPRs - 1;
+ }
+ }
+
+ // Account for extra SGPRs and VGPRs reserved for debugger use.
+ MaxSGPR += ExtraSGPRs;
+ MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
+
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
- if (STM.hasSGPRInitBug()) {
- if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
+ // Adjust number of registers used to meet default/requested minimum/maximum
+ // number of waves per execution unit request.
+ ProgInfo.NumSGPRsForWavesPerEU = std::max(
+ ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+ ProgInfo.NumVGPRsForWavesPerEU = std::max(
+ ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+
+ if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+ STM.hasSGPRInitBug()) {
+ unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
+ if (ProgInfo.NumSGPR > MaxNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm to use the
+ // registers which are usually reserved for vcc etc.
+
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
- "SGPRs with SGPR init bug",
- ProgInfo.NumSGPR, DS_Error);
+ "scalar registers",
+ ProgInfo.NumSGPR, DS_Error,
+ DK_ResourceLimit, MaxNumSGPRs);
Ctx.diagnose(Diag);
+ ProgInfo.NumSGPR = MaxNumSGPRs;
+ ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
}
+ }
+ if (STM.hasSGPRInitBug()) {
ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
@@ -490,26 +556,34 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx.diagnose(Diag);
}
- if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
+ if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
- MFI->LDSSize, DS_Error);
+ MFI->getLDSSize(), DS_Error);
Ctx.diagnose(Diag);
}
- ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
- ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
+ // SGPRBlocks is actual number of SGPR blocks minus 1.
+ ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
+ RI->getSGPRAllocGranule());
+ ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+
+ // VGPRBlocks is actual number of VGPR blocks minus 1.
+ ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
+ RI->getVGPRAllocGranule());
+ ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
ProgInfo.FloatMode = getFPMode(MF);
- ProgInfo.IEEEMode = 0;
+ ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = 1;
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo->getStackSize();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ ProgInfo.ScratchSize = FrameInfo.getStackSize();
ProgInfo.FlatUsed = FlatUsed;
ProgInfo.VCCUsed = VCCUsed;
@@ -524,10 +598,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
LDSAlignShift = 9;
}
- unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
- MFI->getMaximumWorkGroupSize(MF);
+ unsigned LDSSpillSize =
+ MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
- ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+ ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
@@ -573,7 +647,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
static unsigned getRsrcReg(CallingConv::ID CallConv) {
switch (CallConv) {
- default: // Fall through
+ default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
@@ -703,7 +777,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
if (STM.isXNACKEnabled())
header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+ // FIXME: Should use getKernArgSize
+ header.kernarg_segment_byte_size =
+ STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
@@ -711,6 +787,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+ // These alignment values are specified in powers of two, so alignment =
+ // 2^n. The minimum alignment is 2^4 = 16.
+ header.kernarg_segment_alignment = std::max((size_t)4,
+ countTrailingZeros(MFI->getMaxKernArgAlign()));
+
if (STM.debuggerEmitPrologue()) {
header.debug_wavefront_private_segment_offset_sgpr =
KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
@@ -745,231 +826,3 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
*TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
return false;
}
-
-// Emit a key and an integer value for runtime metadata.
-static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer,
- RuntimeMD::Key K, uint64_t V,
- unsigned Size) {
- Streamer->EmitIntValue(K, 1);
- Streamer->EmitIntValue(V, Size);
-}
-
-// Emit a key and a string value for runtime metadata.
-static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer,
- RuntimeMD::Key K, StringRef S) {
- Streamer->EmitIntValue(K, 1);
- Streamer->EmitIntValue(S.size(), 4);
- Streamer->EmitBytes(S);
-}
-
-// Emit a key and three integer values for runtime metadata.
-// The three integer values are obtained from MDNode \p Node;
-static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer,
- RuntimeMD::Key K, MDNode *Node,
- unsigned Size) {
- Streamer->EmitIntValue(K, 1);
- Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
- Node->getOperand(0))->getZExtValue(), Size);
- Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
- Node->getOperand(1))->getZExtValue(), Size);
- Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
- Node->getOperand(2))->getZExtValue(), Size);
-}
-
-void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
- OutStreamer->SwitchSection(getObjFileLowering().getContext()
- .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
-
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
- RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
- if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
- if (MD->getNumOperands()) {
- auto Node = MD->getOperand(0);
- if (Node->getNumOperands() > 1) {
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
- RuntimeMD::OpenCL_C, 1);
- uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
- ->getZExtValue();
- uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
- ->getZExtValue();
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
- Major * 100 + Minor * 10, 2);
- }
- }
- }
-}
-
-static std::string getOCLTypeName(Type *Ty, bool isSigned) {
- if (VectorType* VecTy = dyn_cast<VectorType>(Ty)) {
- Type* EleTy = VecTy->getElementType();
- unsigned Size = VecTy->getVectorNumElements();
- return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str();
- }
- switch (Ty->getTypeID()) {
- case Type::HalfTyID: return "half";
- case Type::FloatTyID: return "float";
- case Type::DoubleTyID: return "double";
- case Type::IntegerTyID: {
- if (!isSigned)
- return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str();
- auto IntTy = cast<IntegerType>(Ty);
- auto BW = IntTy->getIntegerBitWidth();
- switch (BW) {
- case 8:
- return "char";
- case 16:
- return "short";
- case 32:
- return "int";
- case 64:
- return "long";
- default:
- return (Twine('i') + Twine(BW)).str();
- }
- }
- default:
- llvm_unreachable("invalid type");
- }
-}
-
-static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType(
- Type *Ty, StringRef TypeName) {
- if (auto VT = dyn_cast<VectorType>(Ty))
- return getRuntimeMDValueType(VT->getElementType(), TypeName);
- else if (auto PT = dyn_cast<PointerType>(Ty))
- return getRuntimeMDValueType(PT->getElementType(), TypeName);
- else if (Ty->isHalfTy())
- return RuntimeMD::KernelArg::F16;
- else if (Ty->isFloatTy())
- return RuntimeMD::KernelArg::F32;
- else if (Ty->isDoubleTy())
- return RuntimeMD::KernelArg::F64;
- else if (IntegerType* intTy = dyn_cast<IntegerType>(Ty)) {
- bool Signed = !TypeName.startswith("u");
- switch (intTy->getIntegerBitWidth()) {
- case 8:
- return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8;
- case 16:
- return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16;
- case 32:
- return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32;
- case 64:
- return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64;
- default:
- // Runtime does not recognize other integer types. Report as
- // struct type.
- return RuntimeMD::KernelArg::Struct;
- }
- } else
- return RuntimeMD::KernelArg::Struct;
-}
-
-void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) {
- if (!F.getMetadata("kernel_arg_type"))
- return;
-
- MCContext &Context = getObjFileLowering().getContext();
- OutStreamer->SwitchSection(
- Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
- OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1);
- emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName());
-
- for (auto &Arg:F.args()) {
- // Emit KeyArgBegin.
- unsigned I = Arg.getArgNo();
- OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1);
-
- // Emit KeyArgSize and KeyArgAlign.
- auto T = Arg.getType();
- auto DL = F.getParent()->getDataLayout();
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize,
- DL.getTypeAllocSize(T), 4);
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign,
- DL.getABITypeAlignment(T), 4);
-
- // Emit KeyArgTypeName.
- auto TypeName = dyn_cast<MDString>(F.getMetadata(
- "kernel_arg_type")->getOperand(I))->getString();
- emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName);
-
- // Emit KeyArgName.
- if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) {
- auto ArgName = cast<MDString>(ArgNameMD->getOperand(
- I))->getString();
- emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName);
- }
-
- // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe.
- auto TypeQual = cast<MDString>(F.getMetadata(
- "kernel_arg_type_qual")->getOperand(I))->getString();
- SmallVector<StringRef, 1> SplitQ;
- TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/);
- for (auto &I:SplitQ) {
- auto Key = StringSwitch<RuntimeMD::Key>(I)
- .Case("volatile", RuntimeMD::KeyArgIsVolatile)
- .Case("restrict", RuntimeMD::KeyArgIsRestrict)
- .Case("const", RuntimeMD::KeyArgIsConst)
- .Case("pipe", RuntimeMD::KeyArgIsPipe)
- .Default(RuntimeMD::KeyNull);
- OutStreamer->EmitIntValue(Key, 1);
- }
-
- // Emit KeyArgTypeKind.
- auto BaseTypeName = cast<MDString>(
- F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString();
- auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName)
- .Case("sampler_t", RuntimeMD::KernelArg::Sampler)
- .Case("queue_t", RuntimeMD::KernelArg::Queue)
- .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
- "image2d_t" , "image2d_array_t", RuntimeMD::KernelArg::Image)
- .Cases("image2d_depth_t", "image2d_array_depth_t",
- "image2d_msaa_t", "image2d_array_msaa_t",
- "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image)
- .Cases("image2d_array_msaa_depth_t", "image3d_t",
- RuntimeMD::KernelArg::Image)
- .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer :
- RuntimeMD::KernelArg::Value);
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1);
-
- // Emit KeyArgValueType.
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType,
- getRuntimeMDValueType(T, BaseTypeName), 2);
-
- // Emit KeyArgAccQual.
- auto AccQual = cast<MDString>(F.getMetadata(
- "kernel_arg_access_qual")->getOperand(I))->getString();
- auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual)
- .Case("read_only", RuntimeMD::KernelArg::ReadOnly)
- .Case("write_only", RuntimeMD::KernelArg::WriteOnly)
- .Case("read_write", RuntimeMD::KernelArg::ReadWrite)
- .Default(RuntimeMD::KernelArg::None);
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual,
- AQ, 1);
-
- // Emit KeyArgAddrQual.
- if (isa<PointerType>(T))
- emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual,
- T->getPointerAddressSpace(), 1);
-
- // Emit KeyArgEnd
- OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1);
- }
-
- // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint.
- if (auto RWGS = F.getMetadata("reqd_work_group_size"))
- emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize,
- RWGS, 4);
- if (auto WGSH = F.getMetadata("work_group_size_hint"))
- emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint,
- WGSH, 4);
- if (auto VTH = F.getMetadata("vec_type_hint")) {
- auto TypeName = getOCLTypeName(cast<ValueAsMetadata>(
- VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
- VTH->getOperand(1))->getZExtValue());
- emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint,
- TypeName);
- }
-
- // Emit KeyKernelEnd
- OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1);
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 7b04c53..9a4bafe 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -15,10 +15,13 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#include "AMDGPUMCInstLower.h"
+
#include "llvm/CodeGen/AsmPrinter.h"
#include <vector>
namespace llvm {
+class MCOperand;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
@@ -40,6 +43,8 @@ private:
NumVGPR(0),
NumSGPR(0),
FlatUsed(false),
+ NumSGPRsForWavesPerEU(0),
+ NumVGPRsForWavesPerEU(0),
ReservedVGPRFirst(0),
ReservedVGPRCount(0),
DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
@@ -71,15 +76,23 @@ private:
uint32_t LDSSize;
bool FlatUsed;
+ // Number of SGPRs that meets number of waves per execution unit request.
+ uint32_t NumSGPRsForWavesPerEU;
+
+ // Number of VGPRs that meets number of waves per execution unit request.
+ uint32_t NumVGPRsForWavesPerEU;
+
// If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
// fixed VGPR number reserved.
uint16_t ReservedVGPRFirst;
+
// The number of consecutive VGPRs reserved.
uint16_t ReservedVGPRCount;
// Fixed SGPR number used to hold wave scratch offset for entire kernel
// execution, or uint16_t(-1) if the register is not used or not known.
uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+
// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
// kernel execution, or uint16_t(-1) if the register is not used or not
// known.
@@ -108,9 +121,16 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "AMDGPU Assembly Printer";
- }
+ StringRef getPassName() const override;
+
+ /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+ /// pseudo lowering.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+ /// instructions.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
/// Implemented in AMDGPUMCInstLower.cpp
void EmitInstruction(const MachineInstr *MI) override;
@@ -123,14 +143,13 @@ public:
void EmitStartOfAsmFile(Module &M) override;
+ bool isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const override;
+
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) override;
- void emitStartOfRuntimeMetadata(const Module &M);
-
- void emitRuntimeMetadata(const Function &F);
-
protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 1a1da8a..d53cc15 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===//
+//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// The LLVM Compiler Infrastructure
//
@@ -34,9 +34,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return true;
}
-bool AMDGPUCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args,
- const SmallVectorImpl<unsigned> &VRegs) const {
+bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const {
// TODO: Implement once there are generic loads/stores.
return true;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 61174ba..9ae87c9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -27,10 +27,8 @@ class AMDGPUCallLowering: public CallLowering {
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
unsigned VReg) const override;
- bool
- lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function::ArgumentListType &Args,
- const SmallVectorImpl<unsigned> &VRegs) const override;
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<unsigned> VRegs) const override;
};
} // End of namespace llvm;
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b955e23..e623054 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -39,6 +39,78 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
Module *Mod;
bool HasUnsafeFPMath;
+ /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// binary operation \p V.
+ ///
+ /// \returns Binary operation \p V.
+ Value *copyFlags(const BinaryOperator &I, Value *V) const;
+
+ /// \returns \p T's base element bit width.
+ unsigned getBaseElementBitWidth(const Type *T) const;
+
+ /// \returns Equivalent 32 bit integer type for given type \p T. For example,
+ /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
+ /// is returned.
+ Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
+
+ /// \returns True if binary operation \p I is a signed binary operation, false
+ /// otherwise.
+ bool isSigned(const BinaryOperator &I) const;
+
+ /// \returns True if the condition of 'select' operation \p I comes from a
+ /// signed 'icmp' operation, false otherwise.
+ bool isSigned(const SelectInst &I) const;
+
+ /// \returns True if type \p T needs to be promoted to 32 bit integer type,
+ /// false otherwise.
+ bool needsPromotionToI32(const Type *T) const;
+
+ /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal to 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
+ /// truncating the result of 32 bit binary operation back to \p I's original
+ /// type. Division operations are not promoted.
+ ///
+ /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
+ /// false otherwise.
+ bool promoteUniformOpToI32(BinaryOperator &I) const;
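+ // For illustration of the promotion above (a sketch only; the value names
+ // are placeholders, and 'add' uses zero extension since it is not in
+ // isSigned()):
+ //   %r = add nsw i16 %a, %b
+ // becomes
+ //   %a32 = zext i16 %a to i32
+ //   %b32 = zext i16 %b to i32
+ //   %r32 = add nsw i32 %a32, %b32
+ //   %r   = trunc i32 %r32 to i16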
+
+ /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal to 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(ICmpInst &I) const;
+
+ /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal to 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
+ /// result of 32 bit 'select' operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(SelectInst &I) const;
+
+ /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// intrinsic.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal to 16. Promotion is done by zero extending the operand to 32
+ /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
+ /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
+ /// shift amount is 32 minus \p I's base element bit width), and truncating
+ /// the result of the shift operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
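+ // For illustration of the promotion above (a sketch only; value names are
+ // placeholders):
+ //   %r = call i16 @llvm.bitreverse.i16(i16 %a)
+ // becomes
+ //   %a32 = zext i16 %a to i32
+ //   %r32 = call i32 @llvm.bitreverse.i32(i32 %a32)
+ //   %s32 = lshr i32 %r32, 16        ; 32 minus the original bit width
+ //   %r   = trunc i32 %s32 to i16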
+
public:
static char ID;
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -51,16 +123,18 @@ public:
bool visitFDiv(BinaryOperator &I);
- bool visitInstruction(Instruction &I) {
- return false;
- }
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitBinaryOperator(BinaryOperator &I);
+ bool visitICmpInst(ICmpInst &I);
+ bool visitSelectInst(SelectInst &I);
+
+ bool visitIntrinsicInst(IntrinsicInst &I);
+ bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override {
- return "AMDGPU IR optimizations";
- }
+ StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();
@@ -70,6 +144,171 @@ public:
} // End anonymous namespace
+Value *AMDGPUCodeGenPrepare::copyFlags(
+ const BinaryOperator &I, Value *V) const {
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp) // Possibly constant expression.
+ return V;
+
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
+ BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ } else if (isa<PossiblyExactOperator>(BinOp))
+ BinOp->setIsExact(I.isExact());
+
+ return V;
+}
+
+unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+ assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+ if (T->isIntegerTy())
+ return T->getIntegerBitWidth();
+ return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
+}
+
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+ assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+ if (T->isIntegerTy())
+ return B.getInt32Ty();
+ return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+ return I.getOpcode() == Instruction::AShr ||
+ I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+ return isa<ICmpInst>(I.getOperand(0)) ?
+ cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
+}
+
+bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+ if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
+ T->getIntegerBitWidth() <= 16)
+ return true;
+ if (!T->isVectorTy())
+ return false;
+ return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ if (I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::UDiv)
+ return false;
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Value *ExtOp0 = nullptr;
+ Value *ExtOp1 = nullptr;
+ Value *ExtRes = nullptr;
+ Value *TruncRes = nullptr;
+
+ if (isSigned(I)) {
+ ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ } else {
+ ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ }
+ ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+ TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+ assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
+ Value *ExtOp0 = nullptr;
+ Value *ExtOp1 = nullptr;
+ Value *NewICmp = nullptr;
+
+ if (I.isSigned()) {
+ ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ } else {
+ ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ }
+ NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+ I.replaceAllUsesWith(NewICmp);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Value *ExtOp1 = nullptr;
+ Value *ExtOp2 = nullptr;
+ Value *ExtRes = nullptr;
+ Value *TruncRes = nullptr;
+
+ if (isSigned(I)) {
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
+ } else {
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
+ }
+ ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+ TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+ IntrinsicInst &I) const {
+ assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
+ "I must be bitreverse intrinsic");
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Function *I32 =
+ Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+ Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
+ Value *LShrOp =
+ Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
+ Value *TruncRes =
+ Builder.CreateTrunc(LShrOp, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
@@ -85,7 +324,6 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Type *Ty = FDiv.getType();
- // TODO: Handle half
if (!Ty->getScalarType()->isFloatTy())
return false;
@@ -154,6 +392,55 @@ static bool hasUnsafeFPMath(const Function &F) {
return Attr.getValueAsString() == "true";
}
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
+ case Intrinsic::bitreverse:
+ return visitBitreverseIntrinsicInst(I);
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformBitreverseToI32(I);
+
+ return Changed;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;
return false;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index bbc28b8..805fb71 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -10,23 +10,22 @@
// Interface to describe a layout of a stack frame on an AMDGPU target machine.
//
//===----------------------------------------------------------------------===//
+
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
int LAO, unsigned TransAl)
: TargetFrameLowering(D, StackAl, LAO, TransAl) { }
-AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
+AMDGPUFrameLowering::~AMDGPUFrameLowering() = default;
unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
-
// XXX: Hardcoding to 1 for now.
//
// I think the StackWidth should be stored as metadata associated with the
@@ -75,7 +74,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const AMDGPURegisterInfo *RI
= MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
@@ -86,19 +85,18 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
// XXX: We should only do this when the shader actually uses this
// information.
unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
- int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
+ int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
- for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i));
- OffsetBytes += MFI->getObjectSize(i);
+ for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+ OffsetBytes += MFI.getObjectSize(i);
// Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
OffsetBytes = alignTo(OffsetBytes, 4);
}
if (FI != -1)
- OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI));
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
return OffsetBytes / (getStackWidth(MF) * 4);
}
-
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 513848a..5d51351 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -11,6 +11,7 @@
/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
+
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
@@ -27,7 +28,7 @@ class AMDGPUFrameLowering : public TargetFrameLowering {
public:
AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
unsigned TransAl = 1);
- virtual ~AMDGPUFrameLowering();
+ ~AMDGPUFrameLowering() override;
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
@@ -40,5 +41,7 @@ public:
return false;
}
};
-} // namespace llvm
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 23c9352..5bf347e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -12,25 +12,48 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
-#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPURegisterInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <vector>
using namespace llvm;
namespace llvm {
+
class R600InstrInfo;
-}
+
+} // end namespace llvm
//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
@@ -38,18 +61,6 @@ class R600InstrInfo;
namespace {
-static bool isCBranchSCC(const SDNode *N) {
- assert(N->getOpcode() == ISD::BRCOND);
- if (!N->hasOneUse())
- return false;
-
- SDValue Cond = N->getOperand(1);
- if (Cond.getOpcode() == ISD::CopyToReg)
- Cond = Cond.getOperand(2);
- return Cond.getOpcode() == ISD::SETCC &&
- Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse();
-}
-
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -58,16 +69,18 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
const AMDGPUSubtarget *Subtarget;
public:
- AMDGPUDAGToDAGISel(TargetMachine &TM);
- virtual ~AMDGPUDAGToDAGISel();
+ explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel) {}
+ ~AMDGPUDAGToDAGISel() override = default;
+
bool runOnMachineFunction(MachineFunction &MF) override;
void Select(SDNode *N) override;
- const char *getPassName() const override;
- void PreprocessISelDAG() override;
+ StringRef getPassName() const override;
void PostprocessISelDAG() override;
private:
- bool isInlineImmediate(SDNode *N) const;
+ SDValue foldFrameIndex(SDValue N) const;
+ bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
@@ -145,40 +158,46 @@ private:
void SelectADD_SUB_I64(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectFMA_W_CHAIN(SDNode *N);
+ void SelectFMUL_W_CHAIN(SDNode *N);
SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);
void SelectS_BFEFromShifts(SDNode *N);
void SelectS_BFE(SDNode *N);
+ bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};
+
} // end anonymous namespace
/// \brief This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
- return new AMDGPUDAGToDAGISel(TM);
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
-AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
- : SelectionDAGISel(TM) {}
-
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
-AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
-}
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+ const SIInstrInfo *TII
+ = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
-bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
- const SITargetLowering *TL
- = static_cast<const SITargetLowering *>(getTargetLowering());
- return TL->analyzeImmediate(N) == 0;
+ return false;
}
/// \brief Determine the register class for \p OpNo
@@ -187,8 +206,21 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
unsigned OpNo) const {
- if (!N->isMachineOpcode())
+ if (!N->isMachineOpcode()) {
+ if (N->getOpcode() == ISD::CopyToReg) {
+ unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
+ return MRI.getRegClass(Reg);
+ }
+
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ return TRI->getPhysRegClass(Reg);
+ }
+
return nullptr;
+ }
switch (N->getMachineOpcode()) {
default: {
@@ -244,7 +276,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
- return AMDGPU::SReg_32RegClassID;
+ return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
case 4:
@@ -275,7 +307,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
case ISD::ADD:
- case ISD::SUB: {
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUB:
+ case ISD::SUBC:
+ case ISD::SUBE: {
if (N->getValueType(0) != MVT::i64 ||
Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
@@ -283,6 +319,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case AMDGPUISD::FMUL_W_CHAIN: {
+ SelectFMUL_W_CHAIN(N);
+ return;
+ }
+ case AMDGPUISD::FMA_W_CHAIN: {
+ SelectFMA_W_CHAIN(N);
+ return;
+ }
+
case ISD::SCALAR_TO_VECTOR:
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
@@ -498,7 +543,7 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
Term->getMetadata("structurizecfg.uniform");
}
-const char *AMDGPUDAGToDAGISel::getPassName() const {
+StringRef AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
@@ -563,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
Base = Addr.getOperand(0);
@@ -580,7 +629,12 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- bool IsAdd = (N->getOpcode() == ISD::ADD);
+ unsigned Opcode = N->getOpcode();
+ bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
+ bool ProduceCarry =
+ ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
+ bool IsAdd =
+ (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE);
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
@@ -596,25 +650,70 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
DL, MVT::i32, RHS, Sub1);
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
- SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs);
- SDValue Carry(AddLo, 1);
- SDNode *AddHi
- = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
- SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
+ SDNode *AddLo;
+ if (!ConsumeCarry) {
+ SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+ AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
+ } else {
+ SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
+ AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
+ }
+ SDValue AddHiArgs[] = {
+ SDValue(Hi0, 0),
+ SDValue(Hi1, 0),
+ SDValue(AddLo, 1)
+ };
+ SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
- SDValue Args[5] = {
+ SDValue RegSequenceArgs[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
SDValue(AddLo,0),
Sub0,
SDValue(AddHi,0),
Sub1,
};
- CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+ SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs);
+
+ if (ProduceCarry) {
+ // Replace the carry-use
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+ }
+
+ // Replace the remaining uses.
+ CurDAG->ReplaceAllUsesWith(N, RegSequence);
+ CurDAG->RemoveDeadNode(N);
+}
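+
+// Illustration of the expansion above (a sketch; the virtual register names
+// are placeholders, not part of this patch). A plain (add i64 %a, %b)
+// selects to:
+//   %lo = S_ADD_U32  %a.sub0, %b.sub0   ; carry out written to SCC
+//   %hi = S_ADDC_U32 %a.sub1, %b.sub1   ; carry in consumed from SCC
+//   %r  = REG_SEQUENCE SReg_64, %lo, sub0, %hi, sub1
+// For ADDE/SUBE the incoming carry operand is consumed by the low half, so
+// the carry-consuming opcode is used there as well; for the carry-producing
+// opcodes the high half's carry out replaces the node's second result.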
+
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ SDValue Ops[10];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+ Ops[8] = N->getOperand(0);
+ Ops[9] = N->getOperand(4);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+ // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
+ SDValue Ops[8];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ Ops[6] = N->getOperand(0);
+ Ops[7] = N->getOperand(3);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
// We need to handle this here because tablegen doesn't support matching
@@ -628,14 +727,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
unsigned Opc
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
- // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
- // omod
- SDValue Ops[8];
-
- SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
- SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
- SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
- CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -779,6 +872,9 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
}
// default case
+
+ // FIXME: This is broken on SI where we still need to check if the base
+ // pointer is positive here.
Base = Addr;
Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
@@ -825,7 +921,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = N2;
VAddr = N3;
} else {
-
// (add N0, C1) -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = N0;
@@ -903,6 +998,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+ return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ return N;
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
@@ -922,14 +1023,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
- VAddr = N0;
+ VAddr = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
// (node)
- VAddr = Addr;
+ VAddr = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
@@ -1122,7 +1223,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
-
SDLoc SL(Addr);
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
@@ -1327,36 +1427,53 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
SelectCode(N);
}
+bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
+ assert(N->getOpcode() == ISD::BRCOND);
+ if (!N->hasOneUse())
+ return false;
+
+ SDValue Cond = N->getOperand(1);
+ if (Cond.getOpcode() == ISD::CopyToReg)
+ Cond = Cond.getOperand(2);
+
+ if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
+ return false;
+
+ MVT VT = Cond.getOperand(0).getSimpleValueType();
+ if (VT == MVT::i32)
+ return true;
+
+ if (VT == MVT::i64) {
+ auto ST = static_cast<const SISubtarget *>(Subtarget);
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
+ }
+
+ return false;
+}
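+
+// Examples of the cases handled above (a sketch, not exhaustive; single-use
+// conditions are assumed):
+//   brcond (setcc i32 %a, %b, ne), %bb  -> SCC branch (S_CBRANCH_SCC*)
+//   brcond (setcc i64 %a, %b, eq), %bb  -> SCC branch only if the subtarget
+//                                          has scalar 64-bit compare-equal
+//   brcond (setcc i64 %a, %b, slt), %bb -> VCC branch emitted below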
+
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);
+ if (Cond.isUndef()) {
+ CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
+ N->getOperand(2), N->getOperand(0));
+ return;
+ }
+
if (isCBranchSCC(N)) {
// This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
SelectCode(N);
return;
}
- // The result of VOPC instructions is or'd against ~EXEC before it is
- // written to vcc or another SGPR. This means that the value '1' is always
- // written to the corresponding bit for results that are masked. In order
- // to correctly check against vccz, we need to and VCC with the EXEC
- // register in order to clear the value from the masked bits.
-
SDLoc SL(N);
- SDNode *MaskedCond =
- CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
- CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
- Cond);
- SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
- SDValue(MaskedCond, 0),
- SDValue()); // Passing SDValue() adds a
- // glue output.
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond);
CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
N->getOperand(2), // Basic Block
- VCC.getValue(0), // Chain
- VCC.getValue(1)); // Glue
- return;
+ VCC.getValue(0));
}
// This is here because there isn't a way to use the generated sub0_sub1 as the
@@ -1427,7 +1544,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
-
unsigned Mods = 0;
Src = In;
@@ -1491,62 +1607,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
-
- // Handle the perverse case where a frame index is being stored. We don't
- // want to see multiple frame index operands on the same instruction since
- // it complicates things and violates some assumptions about frame index
- // lowering.
- for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
- I != E; ++I) {
- SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
- // It's possible that we have a frame index defined in the function that
- // isn't used in this block.
- if (FI.use_empty())
- continue;
-
- // Skip over the AssertZext inserted during lowering.
- SDValue EffectiveFI = FI;
- auto It = FI->use_begin();
- if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
- EffectiveFI = SDValue(*It, 0);
- It = EffectiveFI->use_begin();
- }
-
- for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
- SDUse &Use = It.getUse();
- SDNode *User = Use.getUser();
- unsigned OpIdx = It.getOperandNo();
- ++It;
-
- if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
- unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
- if (OpIdx == PtrIdx)
- continue;
-
- unsigned OpN = M->getNumOperands();
- SDValue NewOps[8];
-
- assert(OpN < array_lengthof(NewOps));
- for (unsigned Op = 0; Op != OpN; ++Op) {
- if (Op != OpIdx) {
- NewOps[Op] = M->getOperand(Op);
- continue;
- }
-
- MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SDLoc(M), MVT::i32, FI);
- NewOps[Op] = SDValue(Mov, 0);
- }
-
- CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
- }
- }
- }
-}
-
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 352423ed..54caa2c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -37,7 +37,7 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
MachineFunction &MF = State.getMachineFunction();
AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
- uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+ uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
ArgFlags.getOrigAlign());
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
@@ -55,14 +55,6 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
- unsigned StoreSize = VT.getStoreSizeInBits();
- if (StoreSize <= 32)
- return EVT::getIntegerVT(Ctx, StoreSize);
-
- return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
-}
-
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -180,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
-
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
-
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -287,6 +269,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
@@ -367,6 +350,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::OR, VT, Expand);
setOperationAction(ISD::SHL, VT, Expand);
setOperationAction(ISD::SRA, VT, Expand);
@@ -440,22 +425,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+ // There are no libcalls of any kind.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For now,
+ // we don't have a way of knowing during instruction selection if a condition
+ // will be uniform and we always use vector compares. Assume we are using
+ // vector compares until that is fixed.
+ setHasMultipleConditionRegisters(true);
+
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
// (Section 7.3)
setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
- setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
@@ -472,22 +466,42 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemset = 4096;
setTargetDAGCombine(ISD::BITCAST);
- setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MULHU);
+ setTargetDAGCombine(ISD::MULHS);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+static bool fnegFoldsIntoOp(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FSIN:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMUL_LEGACY:
+ return true;
+ default:
+ return false;
+ }
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -500,7 +514,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
EVT ScalarVT = VT.getScalarType();
- return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
+ return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
+ (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}
// We don't want to shrink f64 / f32 constants.
@@ -565,12 +580,12 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64;
+ return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
+ VT == MVT::f16);
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
- assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64;
+ return isFAbsFree(VT);
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -593,19 +608,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
// Truncate is just accessing a subregister.
- return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+
+ unsigned SrcSize = Source.getSizeInBits();
+ unsigned DestSize = Dest.getSizeInBits();
+
+ return DestSize < SrcSize && DestSize % 32 == 0;
}
bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
// Truncate is just accessing a subregister.
- return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
- (Dest->getPrimitiveSizeInBits() % 32 == 0);
+
+ unsigned SrcSize = Source->getScalarSizeInBits();
+ unsigned DestSize = Dest->getScalarSizeInBits();
+
+ if (DestSize == 16 && Subtarget->has16BitInsts())
+ return SrcSize >= 32;
+
+ return DestSize < SrcSize && DestSize % 32 == 0;
}
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
unsigned SrcSize = Src->getScalarSizeInBits();
unsigned DestSize = Dest->getScalarSizeInBits();
+ if (SrcSize == 16 && Subtarget->has16BitInsts())
+ return DestSize >= 32;
+
return SrcSize == 32 && DestSize == 64;
}
@@ -614,6 +642,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
// practical purposes, the extra mov 0 to load a 64-bit is free. As used,
// this will enable reducing 64-bit operations the 32-bit, which is always
// good.
+
+ if (Src == MVT::i16)
+ return Dest == MVT::i32 || Dest == MVT::i64;
+
return Src == MVT::i32 && Dest == MVT::i64;
}
@@ -635,9 +667,105 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+/// The SelectionDAGBuilder will automatically promote function arguments
+/// with illegal types. However, this does not work for the AMDGPU targets
+/// since the function arguments are stored in memory as these illegal types.
+/// In order to handle this properly we need to get the original types sizes
+/// from the LLVM IR Function and fixup the ISD:InputArg values before
+/// passing them to AnalyzeFormalArguments()
+
+/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
+/// input values across multiple registers. Each item in the Ins array
+/// represents a single value that will be stored in registers. Ins[x].VT is
+/// the value type of the value that will be stored in the register, so
+/// whatever SDNode we lower the argument to needs to be this type.
+///
+/// In order to correctly lower the arguments we need to know the size of each
+/// argument. Since Ins[x].VT gives us the size of the register that will
+/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
+/// for the original function argument so that we can deduce the correct memory
+/// type to use for Ins[x]. In most cases the correct memory type will be
+/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
+/// we have a kernel argument of type v8i8, this argument will be split into
+/// 8 parts and each part will be represented by its own item in the Ins array.
+/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
+/// the argument before it was split. From this, we deduce that the memory type
+/// for each individual part is i8. We pass the memory type as LocVT to the
+/// calling convention analysis function and the register type (Ins[x].VT) as
+/// the ValVT.
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ const ISD::InputArg &In = Ins[i];
+ EVT MemVT;
+
+ unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ assert(!In.Flags.isSplit());
+ if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the register type
+ MemVT = In.VT;
+ } else {
+ MemVT = In.ArgVT;
+ }
+ } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+ In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+ assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = In.VT;
+ } else if (In.ArgVT.isVector() &&
+ In.ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = In.ArgVT.getScalarType();
+ } else if (In.ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = In.VT;
+ } else {
+ unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (In.VT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (In.VT.isVector()) {
+ assert(!In.VT.getScalarType().isFloatingPoint());
+ unsigned NumElements = In.VT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+ // a different elements size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
+ }
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
+
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
+
+ assert(MemVT.isSimple());
+ allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
+ State);
+ }
+}
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}
@@ -678,8 +806,10 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
- for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
- InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ if (!CLI.IsTailCall) {
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ }
return DAG.getEntryNode();
}
@@ -718,6 +848,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::CTLZ:
@@ -745,94 +876,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-// FIXME: This implements accesses to initialized globals in the constant
-// address space by copying them to private and accessing that. It does not
-// properly handle illegal types or vectors. The private vector loads are not
-// scalarized, and the illegal scalars hit an assertion. This technique will not
-// work well with large initializers, and this should eventually be
-// removed. Initialized globals should be placed into a data section that the
-// runtime will load into a buffer before the kernel is executed. Uses of the
-// global need to be replaced with a pointer loaded from an implicit kernel
-// argument into this buffer holding the copy of the data, which will remove the
-// need for any of this.
-SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
- const GlobalValue *GV,
- const SDValue &InitPtr,
- SDValue Chain,
- SelectionDAG &DAG) const {
- const DataLayout &TD = DAG.getDataLayout();
- SDLoc DL(InitPtr);
- Type *InitTy = Init->getType();
-
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
- EVT VT = EVT::getEVT(InitTy);
- PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
- return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(InitTy));
- }
-
- if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
- EVT VT = EVT::getEVT(CFP->getType());
- PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
- return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(CFP->getType()));
- }
-
- if (StructType *ST = dyn_cast<StructType>(InitTy)) {
- const StructLayout *SL = TD.getStructLayout(ST);
-
- EVT PtrVT = InitPtr.getValueType();
- SmallVector<SDValue, 8> Chains;
-
- for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
- SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
- Constant *Elt = Init->getAggregateElement(I);
- Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
- }
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- }
-
- if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
- EVT PtrVT = InitPtr.getValueType();
-
- unsigned NumElements;
- if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
- NumElements = AT->getNumElements();
- else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
- NumElements = VT->getNumElements();
- else
- llvm_unreachable("Unexpected type");
-
- unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
- SmallVector<SDValue, 8> Chains;
- for (unsigned i = 0; i < NumElements; ++i) {
- SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
- Constant *Elt = Init->getAggregateElement(i);
- Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
- }
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- }
-
- if (isa<UndefValue>(Init)) {
- EVT VT = EVT::getEVT(InitTy);
- PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
- return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- TD.getPrefTypeAlignment(InitTy));
- }
-
- Init->dump();
- llvm_unreachable("Unhandled constant initializer");
-}
-
static bool hasDefinedInitializer(const GlobalValue *GV) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar || !GVar->hasInitializer())
@@ -850,11 +893,6 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
- case AMDGPUAS::CONSTANT_ADDRESS: {
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
- SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
- }
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
@@ -864,24 +902,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (hasDefinedInitializer(GV))
break;
- unsigned Offset;
- if (MFI->LocalMemoryObjects.count(GV) == 0) {
- unsigned Align = GV->getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV->getValueType());
-
- /// TODO: We should sort these to minimize wasted space due to alignment
- /// padding. Currently the padding is decided by the first encountered use
- /// during lowering.
- Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
- MFI->LocalMemoryObjects[GV] = Offset;
- MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
- } else {
- Offset = MFI->LocalMemoryObjects[GV];
- }
-
- return DAG.getConstant(Offset, SDLoc(Op),
- getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
}
@@ -1097,65 +1119,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
-// FIXME: This isn't doing anything for SI. This should be used in a target
-// combine during type legalization.
-SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
- SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT MemVT = Store->getMemoryVT();
- unsigned MemBits = MemVT.getSizeInBits();
-
- // Byte stores are really expensive, so if possible, try to pack 32-bit vector
- // truncating store into an i32 store.
- // XXX: We could also handle optimize other vector bitwidths.
- if (!MemVT.isVector() || MemBits > 32) {
- return SDValue();
- }
-
- SDLoc DL(Op);
- SDValue Value = Store->getValue();
- EVT VT = Value.getValueType();
- EVT ElemVT = VT.getVectorElementType();
- SDValue Ptr = Store->getBasePtr();
- EVT MemEltVT = MemVT.getVectorElementType();
- unsigned MemEltBits = MemEltVT.getSizeInBits();
- unsigned MemNumElements = MemVT.getVectorNumElements();
- unsigned PackedSize = MemVT.getStoreSizeInBits();
- SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32);
-
- assert(Value.getValueType().getScalarSizeInBits() >= 32);
-
- SDValue PackedValue;
- for (unsigned i = 0; i < MemNumElements; ++i) {
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
- DAG.getConstant(i, DL, MVT::i32));
- Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
- Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
-
- SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32);
- Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
-
- if (i == 0) {
- PackedValue = Elt;
- } else {
- PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
- }
- }
-
- if (PackedSize < 32) {
- EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
- return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(), PackedVT,
- Store->getAlignment(),
- Store->getMemOperand()->getFlags());
- }
-
- return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(),
- Store->getAlignment(),
- Store->getMemOperand()->getFlags());
-}
-
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1670,7 +1633,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f64);
- APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+ APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
@@ -1681,7 +1644,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
- APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+ APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
EVT SetCCVT =
@@ -1988,14 +1951,26 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");
+ // TODO: Factor out code common with LowerSINT_TO_FP.
+
EVT DestVT = Op.getValueType();
- if (DestVT == MVT::f64)
- return LowerINT_TO_FP64(Op, DAG, false);
+ if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ SDValue FPRound =
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+ return FPRound;
+ }
if (DestVT == MVT::f32)
return LowerINT_TO_FP32(Op, DAG, false);
- return SDValue();
+ assert(DestVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, false);
}
SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -2003,14 +1978,26 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");
+ // TODO: Factor out code common with LowerUINT_TO_FP.
+
EVT DestVT = Op.getValueType();
+ if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+ SDValue FPRound =
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+ return FPRound;
+ }
+
if (DestVT == MVT::f32)
return LowerINT_TO_FP32(Op, DAG, true);
- if (DestVT == MVT::f64)
- return LowerINT_TO_FP64(Op, DAG, true);
-
- return SDValue();
+ assert(DestVT == MVT::f64);
+ return LowerINT_TO_FP64(Op, DAG, true);
}
SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
@@ -2042,10 +2029,118 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
+SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+
+ if (getTargetMachine().Options.UnsafeFPMath) {
+ // There is a generic expand for FP_TO_FP16 with unsafe fast math.
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+ assert (N0.getSimpleValueType() == MVT::f64);
+
+ // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+ const unsigned ExpMask = 0x7ff;
+ const unsigned ExpBiasf64 = 1023;
+ const unsigned ExpBiasf16 = 15;
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ SDValue One = DAG.getConstant(1, DL, MVT::i32);
+ SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
+ SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
+ DAG.getConstant(32, DL, MVT::i64));
+ UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
+ U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
+ SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(20, DL, MVT::i64));
+ E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
+ DAG.getConstant(ExpMask, DL, MVT::i32));
+ // Subtract the fp64 exponent bias (1023) to get the real exponent and
+ // add the f16 bias (15) to get the biased exponent for the f16 format.
+ E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
+ DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
+
+ SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(8, DL, MVT::i32));
+ M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
+ DAG.getConstant(0xffe, DL, MVT::i32));
+
+ SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
+ DAG.getConstant(0x1ff, DL, MVT::i32));
+ MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
+
+ SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
+ M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
+
+ // (M != 0 ? 0x0200 : 0) | 0x7c00;
+ SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
+ Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
+
+ // N = M | (E << 12);
+ SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getNode(ISD::SHL, DL, MVT::i32, E,
+ DAG.getConstant(12, DL, MVT::i32)));
+
+ // B = clamp(1-E, 0, 13);
+ SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ One, E);
+ SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
+ B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
+ DAG.getConstant(13, DL, MVT::i32));
+
+ SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+ DAG.getConstant(0x1000, DL, MVT::i32));
+
+ SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
+ SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
+ SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
+ D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
+
+ SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
+ SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
+ DAG.getConstant(0x7, DL, MVT::i32));
+ V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
+ One, Zero, ISD::SETEQ);
+ SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
+ One, Zero, ISD::SETGT);
+ V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
+ V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
+
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
+ DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
+ V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
+ I, V, ISD::SETEQ);
+
+ // Extract the sign bit.
+ SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+ DAG.getConstant(16, DL, MVT::i32));
+ Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
+ DAG.getConstant(0x8000, DL, MVT::i32));
+
+ V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
+ return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
+}
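
For reference, the node sequence above transcribed as scalar C++. This is only a sketch of what the emitted DAG computes (round-to-nearest-even f64 to f16); the function name and the use of uint16_t for the half bit pattern are not part of the patch:

  #include <algorithm>
  #include <cstdint>
  #include <cstring>

  uint16_t f64_to_f16_rtne(double In) {
    uint64_t U64;
    std::memcpy(&U64, &In, sizeof(U64));              // ISD::BITCAST

    uint32_t UH = uint32_t(U64 >> 32);                // high dword
    uint32_t U  = uint32_t(U64);                      // low dword
    int32_t  E  = int32_t((UH >> 20) & 0x7ff) - 1023 + 15; // rebias the exponent

    uint32_t M = (UH >> 8) & 0xffe;                   // top mantissa bits + round bit
    uint32_t MaskedSig = (UH & 0x1ff) | U;            // remaining mantissa bits
    M |= (MaskedSig == 0) ? 0 : 1;                    // sticky bit

    uint32_t I = ((M != 0) ? 0x0200 : 0) | 0x7c00;    // quieted NaN / Inf pattern
    uint32_t N = M | (uint32_t(E) << 12);             // normal-case bits

    int32_t  B = std::min(std::max(1 - E, 0), 13);    // denormal shift amount
    uint32_t SigSetHigh = M | 0x1000;                 // implicit leading one
    uint32_t D = SigSetHigh >> B;
    D |= ((D << B) != SigSetHigh) ? 1 : 0;            // sticky bit for shifted-out bits

    uint32_t V = (E < 1) ? D : N;                     // denormal or normal path
    uint32_t VLow3 = V & 0x7;
    V >>= 2;
    V += ((VLow3 == 3) || (VLow3 > 5)) ? 1 : 0;       // round to nearest even

    V = (E > 30) ? 0x7c00 : V;                        // overflow to infinity
    V = (E == 1039) ? I : V;                          // source exponent was 0x7ff

    uint32_t Sign = (UH >> 16) & 0x8000;
    return uint16_t(Sign | V);
  }
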
+
SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ // TODO: Factor out code common with LowerFP_TO_UINT.
+
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ SDLoc DL(Op);
+
+ SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ SDValue FpToInt32 =
+ DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+ return FpToInt32;
+ }
+
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
return LowerFP64_TO_INT(Op, DAG, true);
@@ -2056,6 +2151,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ // TODO: Factor out code common with LowerFP_TO_SINT.
+
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ SDLoc DL(Op);
+
+ SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ SDValue FpToInt32 =
+ DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+ return FpToInt32;
+ }
+
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
return LowerFP64_TO_INT(Op, DAG, false);
@@ -2068,8 +2176,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
- if (!VT.isVector())
- return SDValue();
+ assert(VT.isVector());
SDValue Src = Op.getOperand(0);
SDLoc DL(Op);
@@ -2108,17 +2215,20 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
(VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}
-static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op = Node24->getOperand(OpIdx);
EVT VT = Op.getValueType();
APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
- if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
- DCI.CommitTargetLoweringOpt(TLO);
+ if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+ return true;
+
+ return false;
}
template <typename IntTy>
@@ -2188,6 +2298,9 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorLoad(LN, DAG);
+
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
return DAG.getMergeValues(Ops, SDLoc(N));
@@ -2236,8 +2349,12 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (VT.isVector())
+ return scalarizeVectorStore(SN, DAG);
+
return expandUnalignedStore(SN, DAG);
+ }
if (!IsFast)
return SDValue();
@@ -2262,38 +2379,21 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-// TODO: Should repeat for other bit ops.
-SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
- return SDValue();
-
- // Break up 64-bit and of a constant into two 32-bit ands. This will typically
- // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
- // combine opportunities since most 64-bit operations are decomposed this way.
- // TODO: We won't want this for SALU especially if it is an inline immediate.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
- return SDValue();
-
- uint64_t Val = RHS->getZExtValue();
- if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
- // If either half of the constant is 0, this is really a 32-bit and, so
- // split it. If we can re-use the full materialized constant, keep it.
- return SDValue();
- }
-
- SDLoc SL(N);
+/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
+/// binary operation \p Opc to each half with the corresponding constant operand.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+ DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const {
SelectionDAG &DAG = DCI.DAG;
-
SDValue Lo, Hi;
- std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
+ std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
- SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
- SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
- SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
- SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+ SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
// Re-visit the ands. It's possible we eliminated one of them and it could
// simplify the vector.
@@ -2408,11 +2508,40 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+// We need to specifically handle i64 mul here to avoid unnecessary conversion
+// instructions. If we only match on the legalized i64 mul expansion,
+// SimplifyDemandedBits will be unable to remove them because there will be
+// multiple uses due to the separate mul + mulh[su].
+static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue N0, SDValue N1, unsigned Size, bool Signed) {
+ if (Size <= 32) {
+ unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
+ }
+
+ // Because we want to eliminate extension instructions before the
+ // operation, we need to create a single user here (i.e. not the separate
+ // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
+
+ unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
+
+ SDValue Mul = DAG.getNode(MulOpc, SL,
+ DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
+ Mul.getValue(0), Mul.getValue(1));
+}
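
For context, what the 24-bit multiply family computes in the unsigned case, written as plain C++ (the helper names are made up for the example). MUL_U24 produces the low 32 bits of the 48-bit product, MULHI_U24 the high 32 bits, and MUL_LOHI_U24 yields both halves at once so the i64 result can be reassembled with a single BUILD_PAIR:

  #include <cstdint>

  uint32_t mul_u24(uint32_t A, uint32_t B) {
    uint64_t Full = uint64_t(A & 0xffffff) * uint64_t(B & 0xffffff);
    return uint32_t(Full);           // low half
  }

  uint32_t mulhi_u24(uint32_t A, uint32_t B) {
    uint64_t Full = uint64_t(A & 0xffffff) * uint64_t(B & 0xffffff);
    return uint32_t(Full >> 32);     // high half
  }

  uint64_t mul_lohi_u24(uint32_t A, uint32_t B) {
    // BUILD_PAIR(lo, hi) reassembles the i64 result.
    return (uint64_t(mulhi_u24(A, B)) << 32) | mul_u24(A, B);
  }
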
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
- if (VT.isVector() || VT.getSizeInBits() > 32)
+ unsigned Size = VT.getSizeInBits();
+ if (VT.isVector() || Size > 64)
+ return SDValue();
+
+ // There are i16 integer mul/mad.
+ if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -2425,11 +2554,11 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+ Mul = getMul24(DAG, DL, N0, N1, Size, false);
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+ Mul = getMul24(DAG, DL, N0, N1, Size, true);
} else {
return SDValue();
}
@@ -2439,6 +2568,77 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
return DAG.getSExtOrTrunc(Mul, DL, VT);
}
+SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (!Subtarget->hasMulI24() || VT.isVector())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (!isI24(N0, DAG) || !isI24(N1, DAG))
+ return SDValue();
+
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+
+ SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
+ DCI.AddToWorklist(Mulhi.getNode());
+ return DAG.getSExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (!isU24(N0, DAG) || !isU24(N1, DAG))
+ return SDValue();
+
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+
+ SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
+ DCI.AddToWorklist(Mulhi.getNode());
+ return DAG.getZExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Simplify demanded bits before splitting into multiple users.
+ if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
+
+ unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
+
+ SDLoc SL(N);
+
+ SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
+ SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
+ return DAG.getMergeValues({ MulLo, MulHi }, SL);
+}
+
static bool isNegativeOne(SDValue Val) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
return C->isAllOnesValue();
@@ -2449,23 +2649,21 @@ static bool isCtlzOpc(unsigned Opc) {
return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}
-// Get FFBH node if the incoming op may have been type legalized from a smaller
-// type VT.
-// Need to match pre-legalized type because the generic legalization inserts the
-// add/sub between the select and compare.
-static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
- const SDLoc &SL, SDValue Op) {
+SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL) const {
EVT VT = Op.getValueType();
- EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- if (LegalVT != MVT::i32)
+ EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
+ LegalVT != MVT::i16))
return SDValue();
if (VT != MVT::i32)
- Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
- SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
+ SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
if (VT != MVT::i32)
- FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);
+ FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
return FFBH;
}
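
The node's semantics are documented in the AMDGPUISD enum below as "ctlz with -1 if input is zero"; as a scalar model (the helper name is invented, and __builtin_clz is assumed to be available as in GCC/Clang):

  #include <cstdint>

  int32_t ffbh_u32(uint32_t X) {
    return X == 0 ? -1 : __builtin_clz(X);   // count leading zeros; all-ones for zero input
  }
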
@@ -2493,7 +2691,7 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
isCtlzOpc(RHS.getOpcode()) &&
RHS.getOperand(0) == CmpLHS &&
isNegativeOne(LHS)) {
- return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ return getFFBH_U32(DAG, CmpLHS, SL);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
@@ -2501,14 +2699,99 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
isCtlzOpc(LHS.getOpcode()) &&
LHS.getOperand(0) == CmpLHS &&
isNegativeOne(RHS)) {
- return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ return getFFBH_U32(DAG, CmpLHS, SL);
+ }
+
+ return SDValue();
+}
+
+static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
+ unsigned Op,
+ const SDLoc &SL,
+ SDValue Cond,
+ SDValue N1,
+ SDValue N2) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N1.getValueType();
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
+ N1.getOperand(0), N2.getOperand(0));
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(Op, SL, VT, NewSelect);
+}
+
+// Pull a free FP operation out of a select so it may fold into uses.
+//
+// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
+// select c, (fneg x), k -> fneg (select c, x, (fneg k))
+//
+// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
+// select c, (fabs x), +k -> fabs (select c, x, k)
+static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Cond = N.getOperand(0);
+ SDValue LHS = N.getOperand(1);
+ SDValue RHS = N.getOperand(2);
+
+ EVT VT = N.getValueType();
+ if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
+ (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(),
+ SDLoc(N), Cond, LHS, RHS);
+ }
+
+ bool Inv = false;
+ if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
+ std::swap(LHS, RHS);
+ Inv = true;
+ }
+
+ // TODO: Support vector constants.
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ SDLoc SL(N);
+ // If one side is an fneg/fabs and the other is a constant, we can push the
+ // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
+ SDValue NewLHS = LHS.getOperand(0);
+ SDValue NewRHS = RHS;
+
+ // Careful: if the neg can be folded up, don't try to pull it back down.
+ bool ShouldFoldNeg = true;
+
+ if (NewLHS.hasOneUse()) {
+ unsigned Opc = NewLHS.getOpcode();
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ ShouldFoldNeg = false;
+ if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+ ShouldFoldNeg = false;
+ }
+
+ if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FNEG)
+ NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else if (CRHS->isNegative())
+ return SDValue();
+
+ if (Inv)
+ std::swap(NewLHS, NewRHS);
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+ }
}
return SDValue();
}
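
A concrete instance of the constant case handled above, written as scalar C++ with made-up values:

  // Before: the fneg is stuck on one select input.
  //   R = C ? -X : 4.0f;
  // After: the constant is negated instead and the fneg is hoisted, where it
  // may later fold into a user of R (e.g. as a source modifier).
  float fold_example(bool C, float X) {
    float T = C ? X : -4.0f;
    return -T;
  }
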
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -2521,6 +2804,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
+ if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
+ SelectionDAG &DAG = DCI.DAG;
+ if ((DAG.isConstantValueOfAnyType(True) ||
+ DAG.isConstantValueOfAnyType(True)) &&
+ (!DAG.isConstantValueOfAnyType(False) &&
+ !DAG.isConstantValueOfAnyType(False))) {
+ // Swap cmp + select pair to move constant to false input.
+ // This will allow using VOPC cndmasks more often.
+      // select (setcc x, y), k, x -> select (setcc x, y, inv cc), x, k
+
+ SDLoc SL(N);
+ ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ LHS.getValueType().isInteger());
+
+ SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
+ return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
+ }
+ }
+
if (VT == MVT::f32 && Cond.hasOneUse()) {
SDValue MinMax
= CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
@@ -2533,6 +2835,135 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
+ //
+ // TODO: Check users can fold
+ if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
+ return SDValue();
+
+ SDLoc SL(N);
+ switch (Opc) {
+ case ISD::FADD: {
+ if (!mayIgnoreSignedZero(N0))
+ return SDValue();
+
+ // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() != ISD::FNEG)
+ LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMUL:
+ case AMDGPUISD::FMUL_LEGACY: {
+ // (fneg (fmul x, y)) -> (fmul x, (fneg y))
+ // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (RHS.getOpcode() == ISD::FNEG)
+ RHS = RHS.getOperand(0);
+ else
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMA:
+ case ISD::FMAD: {
+ if (!mayIgnoreSignedZero(N0))
+ return SDValue();
+
+ // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
+ SDValue LHS = N0.getOperand(0);
+ SDValue MHS = N0.getOperand(1);
+ SDValue RHS = N0.getOperand(2);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (MHS.getOpcode() == ISD::FNEG)
+ MHS = MHS.getOperand(0);
+ else
+ MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FP_EXTEND:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case ISD::FSIN:
+ case AMDGPUISD::SIN_HW: {
+ SDValue CvtSrc = N0.getOperand(0);
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_extend (fneg x))) -> (fp_extend x)
+ // (fneg (rcp (fneg x))) -> (rcp x)
+ return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_extend x)) -> (fp_extend (fneg x))
+ // (fneg (rcp x)) -> (rcp (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(Opc, SL, VT, Neg);
+ }
+ case ISD::FP_ROUND: {
+ SDValue CvtSrc = N0.getOperand(0);
+
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_round (fneg x))) -> (fp_round x)
+ return DAG.getNode(ISD::FP_ROUND, SL, VT,
+ CvtSrc.getOperand(0), N0.getOperand(1));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_round x)) -> (fp_round (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
+ }
+ default:
+ return SDValue();
+ }
+}
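
As a scalar picture of the FADD case (hence the mayIgnoreSignedZero guard): negating a sum is rewritten into a sum of negations, which the backend can then absorb as source modifiers. The example values are illustrative:

  // -(X + Y)  ==>  (-X) + (-Y), valid only when signed zeros may be ignored.
  float fneg_fadd_example(float X, float Y) {
    return -X + -Y;
  }
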
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2543,6 +2974,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::BITCAST: {
EVT DestVT = N->getValueType(0);
+
+ // Push casts through vector builds. This helps avoid emitting a large
+ // number of copies when materializing floating point vector constants.
+ //
+ // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+    //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+ if (DestVT.isVector()) {
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcVT = Src.getValueType();
+ unsigned NElts = DestVT.getVectorNumElements();
+
+ if (SrcVT.getVectorNumElements() == NElts) {
+ EVT DestEltVT = DestVT.getVectorElementType();
+
+ SmallVector<SDValue, 8> CastedElts;
+ SDLoc SL(N);
+ for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+ SDValue Elt = Src.getOperand(I);
+ CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
+ }
+
+ return DAG.getBuildVector(DestVT, SL, CastedElts);
+ }
+ }
+ }
+
if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
break;
@@ -2591,24 +3049,28 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
- case ISD::AND: {
- if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
- break;
-
- return performAndCombine(N, DCI);
- }
case ISD::MUL:
return performMulCombine(N, DCI);
+ case ISD::MULHS:
+ return performMulhsCombine(N, DCI);
+ case ISD::MULHU:
+ return performMulhuCombine(N, DCI);
case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24: {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- simplifyI24(N0, DCI);
- simplifyI24(N1, DCI);
+ case AMDGPUISD::MUL_U24:
+ case AMDGPUISD::MULHI_I24:
+ case AMDGPUISD::MULHI_U24: {
+    // If the first call to simplify is successful, then N may end up being
+ // deleted, so we shouldn't call simplifyI24 again.
+ simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
return SDValue();
}
+ case AMDGPUISD::MUL_LOHI_I24:
+ case AMDGPUISD::MUL_LOHI_U24:
+ return performMulLoHi24Combine(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
+ case ISD::FNEG:
+ return performFNegCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
@@ -2705,38 +3167,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// Helper functions
//===----------------------------------------------------------------------===//
-void AMDGPUTargetLowering::getOriginalFunctionArgs(
- SelectionDAG &DAG,
- const Function *F,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<ISD::InputArg> &OrigIns) const {
-
- for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
- if (Ins[i].ArgVT == Ins[i].VT) {
- OrigIns.push_back(Ins[i]);
- continue;
- }
-
- EVT VT;
- if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
- // Vector has been split into scalars.
- VT = Ins[i].ArgVT.getVectorElementType();
- } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
- Ins[i].ArgVT.getVectorElementType() !=
- Ins[i].VT.getVectorElementType()) {
- // Vector elements have been promoted
- VT = Ins[i].ArgVT;
- } else {
- // Vector has been spilt into smaller vectors.
- VT = Ins[i].VT;
- }
-
- ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
- Ins[i].OrigArgIndex, Ins[i].PartOffset);
- OrigIns.push_back(Arg);
- }
-}
-
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2754,7 +3184,8 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- uint64_t ArgOffset = MFI->ABIArgOffset;
+ unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
switch (Param) {
case GRID_DIM:
return ArgOffset;
@@ -2779,6 +3210,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(FMA_W_CHAIN)
+ NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(COS_HW)
NODE_NAME_CASE(SIN_HW)
@@ -2800,7 +3235,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
@@ -2812,12 +3249,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
NODE_NAME_CASE(FFBH_U32)
+ NODE_NAME_CASE(FFBH_I32)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
+ NODE_NAME_CASE(MULHI_U24)
+ NODE_NAME_CASE(MULHI_I24)
+ NODE_NAME_CASE(MUL_LOHI_U24)
+ NODE_NAME_CASE(MUL_LOHI_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
+ NODE_NAME_CASE(EXPORT_DONE)
+ NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
@@ -2833,8 +3277,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(KILL)
+ NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
+ NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
@@ -2844,16 +3291,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(BUFFER_LOAD)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
}
-SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps,
- bool &UseOneConstNR) const {
- SelectionDAG &DAG = DCI.DAG;
+SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps,
+ bool &UseOneConstNR,
+ bool Reciprocal) const {
EVT VT = Operand.getValueType();
if (VT == MVT::f32) {
@@ -2868,9 +3317,8 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
}
SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const {
- SelectionDAG &DAG = DCI.DAG;
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const {
EVT VT = Operand.getValueType();
if (VT == MVT::f32) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c2c7585..f6adcea 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -25,19 +25,19 @@ class AMDGPUSubtarget;
class MachineRegisterInfo;
class AMDGPUTargetLowering : public TargetLowering {
+private:
+ /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
+ /// legalized from a smaller type VT. Need to match pre-legalized type because
+ /// the generic legalization inserts the add/sub between the select and
+ /// compare.
+ SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+
protected:
const AMDGPUSubtarget *Subtarget;
- SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
- const SDValue &InitPtr,
- SDValue Chain,
- SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Lower vector stores by merging the vector elements into an integer
- /// of the same bitwidth.
- SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
@@ -60,6 +60,7 @@ protected:
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
@@ -69,17 +70,23 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ uint32_t ValLo, uint32_t ValHi) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
- static EVT getEquivalentBitType(LLVMContext &Context, EVT VT);
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
@@ -102,16 +109,8 @@ protected:
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- /// The SelectionDAGBuilder will automatically promote function arguments
- /// with illegal types. However, this does not work for the AMDGPU targets
- /// since the function arguments are stored in memory as these illegal types.
- /// In order to handle this properly we need to get the origianl types sizes
- /// from the LLVM IR Function and fixup the ISD:InputArg values before
- /// passing them to AnalyzeFormalArguments()
- void getOriginalFunctionArgs(SelectionDAG &DAG,
- const Function *F,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<ISD::InputArg> &OrigIns) const;
+ void analyzeFormalArgumentsCompute(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
void AnalyzeFormalArguments(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
void AnalyzeReturn(CCState &State,
@@ -120,6 +119,16 @@ protected:
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+ bool mayIgnoreSignedZero(SDValue Op) const {
+ if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only
+ return true;
+
+ if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))
+ return BO->Flags.hasNoSignedZeros();
+
+ return false;
+ }
+
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;
@@ -171,13 +180,14 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
- SDValue getRsqrtEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps,
- bool &UseOneConstNR) const override;
- SDValue getRecipEstimate(SDValue Operand,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const override;
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const = 0;
@@ -228,6 +238,13 @@ enum NodeType : unsigned {
DWORDADDR,
FRACT,
CLAMP,
+  // This is SETCC with the full mask result, used for compares that produce
+  // one result bit per lane in the wavefront.
+ SETCC,
+ SETREG,
+ // FP ops with input and output chain.
+ FMA_W_CHAIN,
+ FMUL_W_CHAIN,
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
@@ -254,7 +271,9 @@ enum NodeType : unsigned {
// For f64, max error 2^29 ULP, handles denormals.
RCP,
RSQ,
+ RCP_LEGACY,
RSQ_LEGACY,
+ FMUL_LEGACY,
RSQ_CLAMP,
LDEXP,
FP_CLASS,
@@ -266,12 +285,19 @@ enum NodeType : unsigned {
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
FFBH_U32, // ctlz with -1 if input is zero.
+ FFBH_I32,
MUL_U24,
MUL_I24,
+ MULHI_U24,
+ MULHI_I24,
MAD_U24,
MAD_I24,
+ MUL_LOHI_I24,
+ MUL_LOHI_U24,
TEXTURE_FETCH,
- EXPORT,
+ EXPORT, // exp on SI+
+ EXPORT_DONE, // exp on SI+ with done bit set
+ R600_EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
REGISTER_STORE,
@@ -298,10 +324,13 @@ enum NodeType : unsigned {
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
SENDMSG,
+ SENDMSGHALT,
INTERP_MOV,
INTERP_P1,
INTERP_P2,
PC_ADD_REL_OFFSET,
+ KILL,
+ DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
@@ -309,6 +338,8 @@ enum NodeType : unsigned {
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
+ BUFFER_LOAD,
+ BUFFER_LOAD_FORMAT,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 9a00ecb..e4dc659 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -23,7 +23,6 @@
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
-#define GET_INSTRINFO_NAMED_OPS
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
@@ -33,10 +32,6 @@ void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
: AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
-bool AMDGPUInstrInfo::enableClusterLoads() const {
- return true;
-}
-
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a59eafa..bd8e389 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -17,17 +17,12 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
#include "llvm/Target/TargetInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#define GET_INSTRINFO_HEADER
#define GET_INSTRINFO_ENUM
-#define GET_INSTRINFO_OPERAND_ENUM
#include "AMDGPUGenInstrInfo.inc"
-#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
-#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
-#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
-#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
-
namespace llvm {
class AMDGPUSubtarget;
@@ -44,8 +39,6 @@ private:
public:
explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
- bool enableClusterLoads() const override;
-
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
@@ -59,15 +52,6 @@ public:
/// equivalent opcode that writes \p Channels Channels.
int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
};
-
-namespace AMDGPU {
- LLVM_READONLY
- int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
-} // End namespace AMDGPU
-
} // End llvm namespace
-#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63)
-#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
-
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 2b13bb9..d7fa28b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -40,6 +40,8 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
[SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
>;
+def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -52,6 +54,9 @@ def AMDGPUconstdata_ptr : SDNode<
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+// Force dependencies for vector trunc stores
+def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
+
def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
@@ -65,6 +70,7 @@ def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
+def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
@@ -82,6 +88,10 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
[]
>;
+def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
// out = max(a, b) a and b are signed ints
@@ -137,6 +147,24 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
+ SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
+def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+
+def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
+ SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+ SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
@@ -202,14 +230,22 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
+def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
-// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when
-// performing the mulitply. The result is a 32-bit value.
+// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
+// when performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
- [SDNPCommutative]
+ [SDNPCommutative, SDNPAssociative]
>;
def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
- [SDNPCommutative]
+ [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
+>;
+def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]
>;
def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
@@ -233,6 +269,10 @@ def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
+def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
SDTypeProfile<1, 3, [SDTCisFP<0>]>,
[SDNPInGlue]>;
@@ -245,6 +285,35 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
SDTypeProfile<1, 4, [SDTCisFP<0>]>,
[SDNPInGlue]>;
+
+def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// SI+ export
+def AMDGPUExportOp : SDTypeProfile<0, 8, [
+ SDTCisInt<0>, // i8 en
+ SDTCisInt<1>, // i1 vm
+ // skip done
+ SDTCisInt<2>, // i8 tgt
+ SDTCisSameAs<3, 1>, // i1 compr
+ SDTCisFP<4>, // f32 src0
+ SDTCisSameAs<5, 4>, // f32 src1
+ SDTCisSameAs<6, 4>, // f32 src2
+ SDTCisSameAs<7, 4> // f32 src3
+]>;
+
+def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
+ [SDNPHasChain, SDNPMayStore]>;
+
+def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// Flow Control Profile Types
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 3944fdb..59cba63 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -42,6 +42,7 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}
+def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
@@ -49,13 +50,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
-// 32-bit VALU immediate operand that uses the constant bus.
-def u32kimm : Operand<i32> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_KIMM32";
- let PrintMethod = "printU32ImmOperand";
-}
-
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
@@ -172,6 +166,12 @@ class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
[{ return N->hasOneUse(); }]
>;
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op $src0, $src1, $src2),
+ [{ return N->hasOneUse(); }]
+>;
+
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
@@ -363,53 +363,54 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
-def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
- (AMDGPUstore_mskor node:$val, node:$ptr), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
-}]>;
+multiclass global_binary_atomic_op<SDNode atomic_op> {
+ def "" : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+ def _noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+
+ def _ret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+}
-class global_binary_atomic_op<SDNode atomic_op> : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]
->;
-
-class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
->;
-
-def atomic_swap_global : global_binary_atomic_op<atomic_swap>;
-def atomic_add_global : global_binary_atomic_op<atomic_load_add>;
-def atomic_and_global : global_binary_atomic_op<atomic_load_and>;
-def atomic_max_global : global_binary_atomic_op<atomic_load_max>;
-def atomic_min_global : global_binary_atomic_op<atomic_load_min>;
-def atomic_or_global : global_binary_atomic_op<atomic_load_or>;
-def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
-def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
-def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
-def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
-
-def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
-def atomic_cmp_swap_global_nortn : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_cmp_swap_global node:$ptr, node:$value),
- [{ return SDValue(N, 0).use_empty(); }]
->;
-
-def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>;
-def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>;
-def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>;
-def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>;
-def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>;
-def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>;
-def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>;
-def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>;
-def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>;
-def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>;
-
-def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
+defm atomic_add_global : global_binary_atomic_op<atomic_load_add>;
+defm atomic_and_global : global_binary_atomic_op<atomic_load_and>;
+defm atomic_max_global : global_binary_atomic_op<atomic_load_max>;
+defm atomic_min_global : global_binary_atomic_op<atomic_load_min>;
+defm atomic_or_global : global_binary_atomic_op<atomic_load_or>;
+defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
+defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
+defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
+defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+
+// Legacy 2-operand form of the global atomic compare-and-swap fragment.
+def AMDGPUatomic_cmp_swap_global : PatFrag<
+ (ops node:$ptr, node:$value),
+ (AMDGPUatomic_cmp_swap node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+def atomic_cmp_swap_global : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+def atomic_cmp_swap_global_noret : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+
+def atomic_cmp_swap_global_ret : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$value),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
@@ -420,6 +421,7 @@ int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP16_ONE = 0x3C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
@@ -559,17 +561,26 @@ multiclass BFIPatterns <Instruction BFI_INT,
def : Pat <
(fcopysign f32:$src0, f32:$src1),
- (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
>;
def : Pat <
(f64 (fcopysign f64:$src0, f64:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
- (BFI_INT (LoadImm32 0x7fffffff),
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)),
(i32 (EXTRACT_SUBREG $src0, sub1)),
(i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
>;
+
+ def : Pat <
+ (f64 (fcopysign f64:$src0, f32:$src1)),
+ (REG_SEQUENCE RC64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ $src1), sub1)
+ >;
}
// SHA-256 Ma patterns
@@ -620,9 +631,9 @@ def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def sub_oneuse : HasOneUseBinOp<sub>;
-// 24-bit arithmetic patterns
-def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
+def select_oneuse : HasOneUseTernaryOp<select>;
// Special conversion patterns
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 2127391..ceae0b5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -16,6 +16,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+
+ // Deprecated in favor of llvm.amdgcn.sffbh
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
// Deprecated in favor of separate int_amdgcn_cube* intrinsics.
@@ -29,9 +31,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_rsq : Intrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
>;
-
- // Deprecated in favor of llvm.amdgcn.read.workdim
- def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
}
include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ad8d3e4..7d56355 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -36,13 +36,92 @@
using namespace llvm;
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
- Ctx(ctx), ST(st) { }
+#include "AMDGPUGenMCPseudoLowering.inc"
+
+
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+ const AsmPrinter &ap):
+ Ctx(ctx), ST(st), AP(ap) { }
static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
switch (MOFlags) {
- default: return MCSymbolRefExpr::VK_None;
- case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL;
+ default:
+ return MCSymbolRefExpr::VK_None;
+ case SIInstrInfo::MO_GOTPCREL:
+ return MCSymbolRefExpr::VK_GOTPCREL;
+ case SIInstrInfo::MO_GOTPCREL32_LO:
+ return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO;
+ case SIInstrInfo::MO_GOTPCREL32_HI:
+ return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI;
+ case SIInstrInfo::MO_REL32_LO:
+ return MCSymbolRefExpr::VK_AMDGPU_REL32_LO;
+ case SIInstrInfo::MO_REL32_HI:
+ return MCSymbolRefExpr::VK_AMDGPU_REL32_HI;
+ }
+}
+
+const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
+ const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const {
+ const MCExpr *DestBBSym
+ = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+ const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
+
+ assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
+ ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+
+  // s_getpc_b64 returns the address of the next instruction.
+ const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
+ SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
+
+ if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+ return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
+
+ assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
+}
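
Numerically, if the s_getpc_b64 in SrcBB sits at a hypothetical byte address Src, it yields Src + 4, so the expression built above evaluates to the following (names are illustrative):

  #include <cstdint>

  int64_t longBranchOffset(uint64_t Dest, uint64_t Src, bool Forward) {
    uint64_t PC = Src + 4;               // s_getpc_b64 returns the next PC
    return Forward ? int64_t(Dest - PC)  // TF_LONG_BRANCH_FORWARD
                   : int64_t(PC - Dest); // TF_LONG_BRANCH_BACKWARD
  }
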
+
+bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) const {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ return true;
+ case MachineOperand::MO_Register:
+ MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
+ return true;
+ case MachineOperand::MO_MachineBasicBlock: {
+ if (MO.getTargetFlags() != 0) {
+ MCOp = MCOperand::createExpr(
+ getLongBranchBlockExpr(*MO.getParent()->getParent(), MO));
+ } else {
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+ }
+
+ return true;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+ SmallString<128> SymbolName;
+ AP.getNameWithPrefix(SymbolName, GV);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName);
+ const MCExpr *SymExpr =
+ MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
+ const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
+ MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ return true;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
+ Sym->setExternal(true);
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ return true;
+ }
}
}
@@ -60,44 +139,24 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
for (const MachineOperand &MO : MI->explicit_operands()) {
MCOperand MCOp;
- switch (MO.getType()) {
- default:
- llvm_unreachable("unknown operand type");
- case MachineOperand::MO_Immediate:
- MCOp = MCOperand::createImm(MO.getImm());
- break;
- case MachineOperand::MO_Register:
- MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
- break;
- case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
- MO.getMBB()->getSymbol(), Ctx));
- break;
- case MachineOperand::MO_GlobalAddress: {
- const GlobalValue *GV = MO.getGlobal();
- MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName()));
- const MCExpr *SymExpr =
- MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
- const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
- MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
- case MachineOperand::MO_ExternalSymbol: {
- MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
- Sym->setExternal(true);
- const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
- }
+ lowerOperand(MO, MCOp);
OutMI.addOperand(MCOp);
}
}
+bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) const {
+ const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
+ return MCInstLowering.lowerOperand(MO, MCOp);
+}
+
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ return;
+
const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
- AMDGPUMCInstLower MCInstLowering(OutContext, STI);
+ AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
@@ -137,6 +196,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::WAVE_BARRIER) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" wave barrier");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
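The forward/backward flags above only change the sign of the encoded distance: s_getpc_b64 is the first (4-byte) instruction of the source block, so the value it produces is the source block address plus 4, and the branch offset is measured from that point. A minimal standalone sketch of the arithmetic getLongBranchBlockExpr encodes, using plain integers instead of MCExpr nodes (the function name is illustrative only):

    #include <cstdint>

    // Offset encoded by getLongBranchBlockExpr, with block addresses as ints.
    int64_t longBranchOffset(uint64_t DestBBAddr, uint64_t SrcBBAddr,
                             bool Forward) {
      uint64_t PC = SrcBBAddr + 4;               // address after s_getpc_b64
      return Forward ? int64_t(DestBBAddr - PC)  // TF_LONG_BRANCH_FORWARD
                     : int64_t(PC - DestBBAddr); // TF_LONG_BRANCH_BACKWARD
    }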
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 957dcd0..57d2d85 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -5,7 +5,6 @@
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
@@ -14,16 +13,28 @@
namespace llvm {
class AMDGPUSubtarget;
+class AsmPrinter;
+class MachineBasicBlock;
class MachineInstr;
+class MachineOperand;
class MCContext;
+class MCExpr;
class MCInst;
+class MCOperand;
class AMDGPUMCInstLower {
MCContext &Ctx;
const AMDGPUSubtarget &ST;
+ const AsmPrinter &AP;
+
+ const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+ const MachineOperand &MO) const;
public:
- AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
+ AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
+ const AsmPrinter &AP);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
/// \brief Lower a MachineInstr to an MCInst
void lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 44516da..40c3327 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,23 +1,47 @@
+//===-- AMDGPUMachineFunction.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
#include "AMDGPUMachineFunction.h"
+#include "AMDGPUSubtarget.h"
using namespace llvm;
-// Pin the vtable to this file.
-void AMDGPUMachineFunction::anchor() {}
-
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
+ LocalMemoryObjects(),
KernArgSize(0),
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- ScratchSize(0),
- IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL ||
- MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
-{
+ IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+ MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+ // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
+ // except reserved size is not correctly aligned.
}
-bool AMDGPUMachineFunction::isKernel() const
-{
- return IsKernel;
+unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
+ const GlobalValue &GV) {
+ auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
+ if (!Entry.second)
+ return Entry.first->second;
+
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ /// TODO: We should sort these to minimize wasted space due to alignment
+ /// padding. Currently the padding is decided by the first encountered use
+ /// during lowering.
+ unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+
+ Entry.first->second = Offset;
+ LDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+ return Offset;
}
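allocateLDSGlobal is a simple align-and-bump allocator: the running LDSSize is rounded up to the global's alignment, that value becomes the global's offset, and the object size is then added. A minimal sketch of the same policy, with illustrative names rather than code from the patch:

    // Align-and-bump allocation as performed by allocateLDSGlobal.
    struct LDSAllocator {
      unsigned LDSSize = 0;

      unsigned allocate(unsigned Size, unsigned Align) {
        LDSSize = (LDSSize + Align - 1) / Align * Align; // alignTo(LDSSize, Align)
        unsigned Offset = LDSSize;
        LDSSize += Size;                                 // reserve the object
        return Offset;
      }
    };

For example, allocating a 4-byte, 4-aligned global and then a 16-byte, 16-aligned one yields offsets 0 and 16, leaving 12 bytes of padding; sorting by alignment, as the TODO above suggests, would avoid that.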
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 6b31f63..5d0640b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,15 +11,26 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
-#include <map>
+#include "llvm/ADT/DenseMap.h"
namespace llvm {
class AMDGPUMachineFunction : public MachineFunctionInfo {
+ /// A map to keep track of local memory objects and their offsets within the
+ /// local memory space.
+ SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
+
uint64_t KernArgSize;
unsigned MaxKernArgAlign;
- virtual void anchor();
+ /// Number of bytes in the LDS that are being used.
+ unsigned LDSSize;
+
+ // FIXME: This should probably be removed.
+ /// Start of implicit kernel args
+ unsigned ABIArgOffset;
+
+ bool IsKernel;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -35,19 +46,31 @@ public:
return Result;
}
- /// A map to keep track of local memory objects and their offsets within
- /// the local memory space.
- std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
- /// Number of bytes in the LDS that are being used.
- unsigned LDSSize;
+ uint64_t getKernArgSize() const {
+ return KernArgSize;
+ }
- /// Start of implicit kernel args
- unsigned ABIArgOffset;
+ unsigned getMaxKernArgAlign() const {
+ return MaxKernArgAlign;
+ }
- bool isKernel() const;
+ void setABIArgOffset(unsigned NewOffset) {
+ ABIArgOffset = NewOffset;
+ }
- unsigned ScratchSize;
- bool IsKernel;
+ unsigned getABIArgOffset() const {
+ return ABIArgOffset;
+ }
+
+ unsigned getLDSSize() const {
+ return LDSSize;
+ }
+
+ bool isKernel() const {
+ return IsKernel;
+ }
+
+ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
};
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
index 8bc7b53..410bd52 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -358,7 +358,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
return transformKernels(M);
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "AMDGPU OpenCL Image Type Pass";
}
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
new file mode 100644
index 0000000..947d45b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -0,0 +1,42 @@
+//===-- AMDGPUPTNote.h - AMDGPU ELF PT_NOTE section info --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and constants for AMDGPU PT_NOTE sections.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+
+namespace AMDGPU {
+
+namespace PT_NOTE {
+
+const char SectionName[] = ".note";
+
+const char NoteName[] = "AMD";
+
+enum NoteType {
+ NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
+ NT_AMDGPU_HSA_HSAIL = 2,
+ NT_AMDGPU_HSA_ISA = 3,
+ NT_AMDGPU_HSA_PRODUCER = 4,
+ NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
+ NT_AMDGPU_HSA_EXTENSION = 6,
+ NT_AMDGPU_HSA_RUNTIME_METADATA = 7,
+ NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
+ NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
+};
+}
+}
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
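These constants plug into the standard ELF note record format: each record carries the sizes of its name and descriptor, a type from the enum above, and then the name and descriptor themselves, each padded to 4-byte alignment. A minimal sketch of that fixed header, assuming the usual SHT_NOTE layout (the struct name is illustrative only):

    #include <cstdint>

    struct ElfNoteHeader {
      uint32_t NameSize; // strlen(NoteName) + 1 = 4 for "AMD"
      uint32_t DescSize; // payload size, e.g. the runtime-metadata blob
      uint32_t Type;     // e.g. NT_AMDGPU_HSA_RUNTIME_METADATA (7)
      // Followed by the name and the descriptor, each padded to 4 bytes.
    };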
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 0bad63f..baa28de 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,9 +76,7 @@ public:
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override {
- return "AMDGPU Promote Alloca";
- }
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
void handleAlloca(AllocaInst &I);
@@ -184,13 +182,12 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// TODO: Have some sort of hint or other heuristics to guess occupancy based
// on other factors..
- unsigned OccupancyHint
- = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
+ unsigned OccupancyHint = ST.getWavesPerEU(F).second;
if (OccupancyHint == 0)
OccupancyHint = 7;
// Clamp to max value.
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
// Check the hint but ignore it if it's obviously wrong from the existing LDS
// usage.
@@ -535,7 +532,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
std::vector<Value*> &WorkList) const {
for (User *User : Val->users()) {
- if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+ if (is_contained(WorkList, User))
continue;
if (CallInst *CI = dyn_cast<CallInst>(User)) {
@@ -550,7 +547,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
if (UseInst->getOpcode() == Instruction::PtrToInt)
return false;
- if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
if (LI->isVolatile())
return false;
@@ -564,11 +561,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
// Reject if the stored value is not the pointer operand.
if (SI->getPointerOperand() != Val)
return false;
- } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
if (RMW->isVolatile())
return false;
- } else if (AtomicCmpXchgInst *CAS
- = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
+ } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
if (CAS->isVolatile())
return false;
}
@@ -583,6 +579,12 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
WorkList.push_back(ICmp);
}
+ if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
+ // Don't collect the users of this.
+ WorkList.push_back(User);
+ continue;
+ }
+
if (!User->getType()->isPointerTy())
continue;
@@ -651,9 +653,11 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
return;
+ const AMDGPUSubtarget &ST =
+ TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
// FIXME: We should also try to get this value from the reqd_work_group_size
// function attribute if it is available.
- unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
+ unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -741,7 +745,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
continue;
}
- // The operand's value should be corrected on its own.
+ // The operand's value should be corrected on its own and we don't want to
+ // touch the users.
if (isa<AddrSpaceCastInst>(V))
continue;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
index 40f6394..ecd2ac7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -13,18 +13,13 @@
///
/// Runtime requests certain information (metadata) about kernels to be able
/// to execute the kernels and answer the queries about the kernels.
-/// The metadata is represented as a byte stream in an ELF section of a
-/// binary (code object). The byte stream consists of key-value pairs.
-/// Each key is an 8 bit unsigned integer. Each value can be an integer,
-/// a string, or a stream of key-value pairs. There are 3 levels of key-value
-/// pair streams. At the beginning of the ELF section is the top level
-/// key-value pair stream. A kernel-level key-value pair stream starts after
-/// encountering KeyKernelBegin and ends immediately before encountering
-/// KeyKernelEnd. A kernel-argument-level key-value pair stream starts
-/// after encountering KeyArgBegin and ends immediately before encountering
-/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top
-/// level key-value pair stream. A kernel-argument-level key-value pair stream
-/// can only appear in a kernel-level key-value pair stream.
+/// The metadata is represented as a note element in the .note ELF section of a
+/// binary (code object). The desc field of the note element is a YAML string
+/// consisting of key-value pairs. Each key is a string. Each value can be
+/// an integer, a string, or a YAML sequence. There are 3 levels of YAML maps.
+/// At the beginning of the YAML string is the module-level YAML map. A
+/// kernel-level YAML map is in the amd.Kernels sequence. A
+/// kernel-argument-level map is in the amd.Args sequence.
///
/// The format should be kept backward compatible. New enum values and bit
/// fields should be appended at the end. It is suggested to bump up the
@@ -37,77 +32,64 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-#include <stdint.h>
+#include <cstdint>
+#include <vector>
+#include <string>
namespace AMDGPU {
namespace RuntimeMD {
// Version and revision of runtime metadata
- const unsigned char MDVersion = 1;
+ const unsigned char MDVersion = 2;
const unsigned char MDRevision = 0;
- // ELF section name containing runtime metadata
- const char SectionName[] = ".AMDGPU.runtime_metadata";
-
- // Enumeration values of keys in runtime metadata.
- enum Key {
- KeyNull = 0, // Place holder. Ignored when encountered
- KeyMDVersion = 1, // Runtime metadata version
- KeyLanguage = 2, // Language
- KeyLanguageVersion = 3, // Language version
- KeyKernelBegin = 4, // Beginning of kernel-level stream
- KeyKernelEnd = 5, // End of kernel-level stream
- KeyKernelName = 6, // Kernel name
- KeyArgBegin = 7, // Beginning of kernel-arg-level stream
- KeyArgEnd = 8, // End of kernel-arg-level stream
- KeyArgSize = 9, // Kernel arg size
- KeyArgAlign = 10, // Kernel arg alignment
- KeyArgTypeName = 11, // Kernel type name
- KeyArgName = 12, // Kernel name
- KeyArgTypeKind = 13, // Kernel argument type kind
- KeyArgValueType = 14, // Kernel argument value type
- KeyArgAddrQual = 15, // Kernel argument address qualifier
- KeyArgAccQual = 16, // Kernel argument access qualifier
- KeyArgIsConst = 17, // Kernel argument is const qualified
- KeyArgIsRestrict = 18, // Kernel argument is restrict qualified
- KeyArgIsVolatile = 19, // Kernel argument is volatile qualified
- KeyArgIsPipe = 20, // Kernel argument is pipe qualified
- KeyReqdWorkGroupSize = 21, // Required work group size
- KeyWorkGroupSizeHint = 22, // Work group size hint
- KeyVecTypeHint = 23, // Vector type hint
- KeyKernelIndex = 24, // Kernel index for device enqueue
- KeySGPRs = 25, // Number of SGPRs
- KeyVGPRs = 26, // Number of VGPRs
- KeyMinWavesPerSIMD = 27, // Minimum number of waves per SIMD
- KeyMaxWavesPerSIMD = 28, // Maximum number of waves per SIMD
- KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits
- KeyMaxWorkGroupSize = 30, // Maximum work group size
- KeyNoPartialWorkGroups = 31, // No partial work groups
- };
-
- enum Language : uint8_t {
- OpenCL_C = 0,
- HCC = 1,
- OpenMP = 2,
- OpenCL_CPP = 3,
-};
-
- enum LanguageVersion : uint16_t {
- V100 = 100,
- V110 = 110,
- V120 = 120,
- V200 = 200,
- V210 = 210,
- };
+ // Name of keys for runtime metadata.
+ namespace KeyName {
+ const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version
+ const char Language[] = "amd.Language"; // Language
+ const char LanguageVersion[] = "amd.LanguageVersion"; // Language version
+ const char Kernels[] = "amd.Kernels"; // Kernels
+ const char KernelName[] = "amd.KernelName"; // Kernel name
+ const char Args[] = "amd.Args"; // Kernel arguments
+ const char ArgSize[] = "amd.ArgSize"; // Kernel arg size
+ const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment
+  const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel argument type name
+  const char ArgName[] = "amd.ArgName"; // Kernel argument name
+ const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind
+ const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type
+ const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier
+ const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier
+ const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified
+ const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified
+ const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified
+ const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified
+ const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size
+ const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint
+ const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint
+ const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue
+ const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups
+  const char PrintfInfo[] = "amd.PrintfInfo"; // Printf function call information
+ const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier
+ const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type
+ }
namespace KernelArg {
- enum TypeKind : uint8_t {
- Value = 0,
- Pointer = 1,
- Image = 2,
- Sampler = 3,
- Queue = 4,
+ enum Kind : uint8_t {
+ ByValue = 0,
+ GlobalBuffer = 1,
+ DynamicSharedPointer = 2,
+ Sampler = 3,
+ Image = 4,
+ Pipe = 5,
+ Queue = 6,
+ HiddenGlobalOffsetX = 7,
+ HiddenGlobalOffsetY = 8,
+ HiddenGlobalOffsetZ = 9,
+ HiddenNone = 10,
+ HiddenPrintfBuffer = 11,
+ HiddenDefaultQueue = 12,
+ HiddenCompletionAction = 13,
};
enum ValueType : uint16_t {
@@ -125,13 +107,86 @@ namespace RuntimeMD {
F64 = 11,
};
+  // Avoid using 'None' since it conflicts with a macro in an X11 header file.
enum AccessQualifer : uint8_t {
- None = 0,
+ AccNone = 0,
ReadOnly = 1,
WriteOnly = 2,
ReadWrite = 3,
};
+
+ enum AddressSpaceQualifer : uint8_t {
+ Private = 0,
+ Global = 1,
+ Constant = 2,
+ Local = 3,
+ Generic = 4,
+ Region = 5,
+ };
} // namespace KernelArg
+
+ // Invalid values are used to indicate an optional key should not be emitted.
+ const uint8_t INVALID_ADDR_QUAL = 0xff;
+ const uint8_t INVALID_ACC_QUAL = 0xff;
+ const uint32_t INVALID_KERNEL_INDEX = ~0U;
+
+ namespace KernelArg {
+ // In-memory representation of kernel argument information.
+ struct Metadata {
+ uint32_t Size;
+ uint32_t Align;
+ uint32_t PointeeAlign;
+ uint8_t Kind;
+ uint16_t ValueType;
+ std::string TypeName;
+ std::string Name;
+ uint8_t AddrQual;
+ uint8_t AccQual;
+ uint8_t IsVolatile;
+ uint8_t IsConst;
+ uint8_t IsRestrict;
+ uint8_t IsPipe;
+ Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0),
+ AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0),
+ IsConst(0), IsRestrict(0), IsPipe(0) {}
+ };
+ }
+
+ namespace Kernel {
+ // In-memory representation of kernel information.
+ struct Metadata {
+ std::string Name;
+ std::string Language;
+ std::vector<uint8_t> LanguageVersion;
+ std::vector<uint32_t> ReqdWorkGroupSize;
+ std::vector<uint32_t> WorkGroupSizeHint;
+ std::string VecTypeHint;
+ uint32_t KernelIndex;
+ uint8_t NoPartialWorkGroups;
+ std::vector<KernelArg::Metadata> Args;
+ Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {}
+ };
+ }
+
+ namespace Program {
+ // In-memory representation of program information.
+ struct Metadata {
+ std::vector<uint8_t> MDVersionSeq;
+ std::vector<std::string> PrintfInfo;
+ std::vector<Kernel::Metadata> Kernels;
+
+ explicit Metadata(){}
+
+    // Construct from a YAML string.
+ explicit Metadata(const std::string &YAML);
+
+ // Convert to YAML string.
+ std::string toYAML();
+
+ // Convert from YAML string.
+ static Metadata fromYAML(const std::string &S);
+ };
+ }
} // namespace RuntimeMD
} // namespace AMDGPU
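The structs above mirror the YAML nesting described in the file comment: a Program holds the amd.Kernels sequence, and each Kernel holds its amd.Args sequence. A minimal sketch of populating them, assuming an illustrative kernel with one global-buffer argument; toYAML() is only declared here, and its implementation lives elsewhere in the backend, so the exact output is not reproduced:

    #include "AMDGPURuntimeMetadata.h"

    std::string buildExampleMetadata() {
      using namespace AMDGPU::RuntimeMD;

      KernelArg::Metadata Arg;       // one pointer argument
      Arg.Size = 8;
      Arg.Align = 8;
      Arg.Kind = KernelArg::GlobalBuffer;
      Arg.TypeName = "int*";

      Kernel::Metadata Kern;         // goes into the amd.Kernels sequence
      Kern.Name = "vector_add";
      Kern.Args.push_back(Arg);

      Program::Metadata Prog;        // module-level map
      Prog.MDVersionSeq = {MDVersion, MDRevision};
      Prog.Kernels.push_back(Kern);
      return Prog.toYAML();
    }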
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 10fa9cf..c35a67d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,14 +13,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "R600InstrInfo.h"
-#include "SIFrameLowering.h"
-#include "SIISelLowering.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <algorithm>
using namespace llvm;
@@ -31,7 +27,7 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
-AMDGPUSubtarget::~AMDGPUSubtarget() {}
+AMDGPUSubtarget::~AMDGPUSubtarget() = default;
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
@@ -52,10 +48,18 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
ParseSubtargetFeatures(GPU, FullFS);
+  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
+ // on VI and newer hardware to avoid assertion failures due to missing ADDR64
+ // variants of MUBUF instructions.
+ if (!hasAddr64() && !FS.contains("flat-for-global")) {
+ FlatForGlobal = true;
+ }
+
  // FIXME: I don't think Evergreen has any useful support for
// denormals, but should be checked. Should we issue a warning somewhere
// if someone tries to enable these?
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ FP16Denormals = false;
FP32Denormals = false;
FP64Denormals = false;
}
@@ -81,10 +85,12 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FastFMAF32(false),
HalfRate64Ops(false),
+ FP16Denormals(false),
FP32Denormals(false),
FP64Denormals(false),
FPExceptions(false),
FlatForGlobal(false),
+ UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
EnableXNACK(false),
@@ -107,6 +113,10 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
SGPRInitBug(false),
HasSMemRealTime(false),
Has16BitInsts(false),
+ HasMovrel(false),
+ HasVGPRIndexMode(false),
+ HasScalarStores(false),
+ HasInv2PiInlineImm(false),
FlatAddressSpace(false),
R600ALUInst(false),
@@ -114,6 +124,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
CFALUBug(false),
HasVertexCache(false),
TexVTXClauseSize(0),
+ ScalarizeGlobal(false),
FeatureDisable(false),
InstrItins(getInstrItineraryForCPU(GPU)) {
@@ -178,6 +189,86 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
return 1;
}
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
+ const Function &F) const {
+ // Default minimum/maximum flat work group sizes.
+ std::pair<unsigned, unsigned> Default =
+ AMDGPU::isCompute(F.getCallingConv()) ?
+ std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
+ getWavefrontSize() * 4) :
+ std::pair<unsigned, unsigned>(1, getWavefrontSize());
+
+ // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+ // starts using "amdgpu-flat-work-group-size" attribute.
+ Default.second = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-max-work-group-size", Default.second);
+ Default.first = std::min(Default.first, Default.second);
+
+ // Requested minimum/maximum flat work group sizes.
+ std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
+ F, "amdgpu-flat-work-group-size", Default);
+
+ // Make sure requested minimum is less than requested maximum.
+ if (Requested.first > Requested.second)
+ return Default;
+
+ // Make sure requested values do not violate subtarget's specifications.
+ if (Requested.first < getMinFlatWorkGroupSize())
+ return Default;
+ if (Requested.second > getMaxFlatWorkGroupSize())
+ return Default;
+
+ return Requested;
+}
+
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
+ const Function &F) const {
+ // Default minimum/maximum number of waves per execution unit.
+ std::pair<unsigned, unsigned> Default(1, 0);
+
+ // Default/requested minimum/maximum flat work group sizes.
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
+
+ // If minimum/maximum flat work group sizes were explicitly requested using
+ // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
+ // number of waves per execution unit to values implied by requested
+ // minimum/maximum flat work group sizes.
+ unsigned MinImpliedByFlatWorkGroupSize =
+ getMaxWavesPerEU(FlatWorkGroupSizes.second);
+ bool RequestedFlatWorkGroupSize = false;
+
+ // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+ // starts using "amdgpu-flat-work-group-size" attribute.
+ if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
+ F.hasFnAttribute("amdgpu-flat-work-group-size")) {
+ Default.first = MinImpliedByFlatWorkGroupSize;
+ RequestedFlatWorkGroupSize = true;
+ }
+
+ // Requested minimum/maximum number of waves per execution unit.
+ std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
+ F, "amdgpu-waves-per-eu", Default, true);
+
+ // Make sure requested minimum is less than requested maximum.
+ if (Requested.second && Requested.first > Requested.second)
+ return Default;
+
+ // Make sure requested values do not violate subtarget's specifications.
+ if (Requested.first < getMinWavesPerEU() ||
+ Requested.first > getMaxWavesPerEU())
+ return Default;
+ if (Requested.second > getMaxWavesPerEU())
+ return Default;
+
+ // Make sure requested values are compatible with values implied by requested
+ // minimum/maximum flat work group sizes.
+ if (RequestedFlatWorkGroupSize &&
+ Requested.first > MinImpliedByFlatWorkGroupSize)
+ return Default;
+
+ return Requested;
+}
+
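The implied minimum comes straight from dividing the largest flat work group into wavefronts and spreading them over the EUs of a compute unit. A minimal worked sketch of that arithmetic, assuming a 64-lane wavefront and the 4 EUs per CU returned by getEUsPerCU() (the helper names below are illustrative):

    // Waves needed by a flat work group of 1024 items: ceil(1024 / 64) = 16.
    unsigned wavesPerWorkGroup(unsigned FlatWorkGroupSize, unsigned WavefrontSize) {
      return (FlatWorkGroupSize + WavefrontSize - 1) / WavefrontSize;
    }

    // Spread over 4 EUs that implies at least ceil(16 / 4) = 4 waves per EU,
    // which is what getWavesPerEU() uses as MinImpliedByFlatWorkGroupSize.
    unsigned minWavesPerEUImpliedBy(unsigned FlatWorkGroupSize) {
      const unsigned WavefrontSize = 64, EUsPerCU = 4; // assumed subtarget values
      unsigned Waves = wavesPerWorkGroup(FlatWorkGroupSize, WavefrontSize);
      return (Waves + EUsPerCU - 1) / EUsPerCU;
    }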
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),
@@ -190,21 +281,7 @@ SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AMDGPUSubtarget(TT, GPU, FS, TM),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this),
- GISel() {}
-
-unsigned R600Subtarget::getStackEntrySize() const {
- switch (getWavefrontSize()) {
- case 16:
- return 8;
- case 32:
- return hasCaymanISA() ? 4 : 8;
- case 64:
- return 4;
- default:
- llvm_unreachable("Illegal wavefront size.");
- }
-}
+ TLInfo(TM, *this) {}
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
@@ -227,15 +304,67 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-unsigned SISubtarget::getAmdKernelCodeChipID() const {
- switch (getGeneration()) {
- case SEA_ISLANDS:
- return 12;
- default:
- llvm_unreachable("ChipID unknown");
+unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
+ unsigned ExplicitArgBytes) const {
+ unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
+ if (ImplicitBytes == 0)
+ return ExplicitArgBytes;
+
+ unsigned Alignment = getAlignmentForImplicitArgPtr();
+ return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+}
+
+unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (SGPRs <= 80)
+ return 10;
+ if (SGPRs <= 88)
+ return 9;
+ if (SGPRs <= 100)
+ return 8;
+ return 7;
}
+ if (SGPRs <= 48)
+ return 10;
+ if (SGPRs <= 56)
+ return 9;
+ if (SGPRs <= 64)
+ return 8;
+ if (SGPRs <= 72)
+ return 7;
+ if (SGPRs <= 80)
+ return 6;
+ return 5;
}
-AMDGPU::IsaVersion SISubtarget::getIsaVersion() const {
- return AMDGPU::getIsaVersion(getFeatureBits());
+unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+ if (VGPRs <= 24)
+ return 10;
+ if (VGPRs <= 28)
+ return 9;
+ if (VGPRs <= 32)
+ return 8;
+ if (VGPRs <= 36)
+ return 7;
+ if (VGPRs <= 40)
+ return 6;
+ if (VGPRs <= 48)
+ return 5;
+ if (VGPRs <= 64)
+ return 4;
+ if (VGPRs <= 84)
+ return 3;
+ if (VGPRs <= 128)
+ return 2;
+ return 1;
+}
+
+unsigned SISubtarget::getMaxNumSGPRs() const {
+ if (hasSGPRInitBug())
+ return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ if (getGeneration() >= VOLCANIC_ISLANDS)
+ return 102;
+
+ return 104;
}
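getKernArgSegmentSize rounds the explicit argument bytes up to the implicit-argument pointer alignment and then appends the implicit block. A minimal worked sketch for an amdhsa OpenCL target, where getImplicitArgNumBytes() returns 32 and getAlignmentForImplicitArgPtr() returns 8 (see the AMDGPUSubtarget.h changes below); the function name is illustrative only:

    // E.g. 36 explicit bytes -> alignTo(36, 8) = 40, plus 32 implicit = 72.
    unsigned kernArgSegmentSize(unsigned ExplicitArgBytes) {
      const unsigned ImplicitBytes = 32, Alignment = 8; // assumed amdhsa + OpenCL
      unsigned Aligned = (ExplicitArgBytes + Alignment - 1) & ~(Alignment - 1);
      return Aligned + ImplicitBytes;
    }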
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 3fe61aa..0e3cb7d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -23,15 +23,22 @@
#include "SIISelLowering.h"
#include "SIFrameLowering.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
namespace llvm {
-class SIMachineFunctionInfo;
class StringRef;
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
@@ -50,9 +57,13 @@ public:
ISAVersion0_0_0,
ISAVersion7_0_0,
ISAVersion7_0_1,
+ ISAVersion7_0_2,
ISAVersion8_0_0,
ISAVersion8_0_1,
- ISAVersion8_0_3
+ ISAVersion8_0_2,
+ ISAVersion8_0_3,
+ ISAVersion8_0_4,
+ ISAVersion8_1_0,
};
protected:
@@ -70,10 +81,12 @@ protected:
bool HalfRate64Ops;
  // Dynamically set bits that enable features.
+ bool FP16Denormals;
bool FP32Denormals;
bool FP64Denormals;
bool FPExceptions;
bool FlatForGlobal;
+ bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
bool EnableXNACK;
bool DebuggerInsertNops;
@@ -97,40 +110,60 @@ protected:
bool SGPRInitBug;
bool HasSMemRealTime;
bool Has16BitInsts;
+ bool HasMovrel;
+ bool HasVGPRIndexMode;
+ bool HasScalarStores;
+ bool HasInv2PiInlineImm;
bool FlatAddressSpace;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
bool HasVertexCache;
short TexVTXClauseSize;
+ bool ScalarizeGlobal;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
public:
AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM);
- virtual ~AMDGPUSubtarget();
+ ~AMDGPUSubtarget() override;
+
AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUInstrInfo *getInstrInfo() const override;
- const AMDGPUFrameLowering *getFrameLowering() const override;
- const AMDGPUTargetLowering *getTargetLowering() const override;
- const AMDGPURegisterInfo *getRegisterInfo() const override;
+ const AMDGPUInstrInfo *getInstrInfo() const override = 0;
+ const AMDGPUFrameLowering *getFrameLowering() const override = 0;
+ const AMDGPUTargetLowering *getTargetLowering() const override = 0;
+ const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
}
+ bool isMesa3DOS() const {
+ return TargetTriple.getOS() == Triple::Mesa3D;
+ }
+
+ bool isOpenCLEnv() const {
+ return TargetTriple.getEnvironment() == Triple::OpenCL;
+ }
+
Generation getGeneration() const {
return Gen;
}
@@ -151,6 +184,10 @@ public:
return MaxPrivateElementSize;
}
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
bool hasHWFP64() const {
return FP64;
}
@@ -230,6 +267,10 @@ public:
return DumpCode;
}
+ bool enableIEEEBit(const MachineFunction &MF) const {
+ return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+ }
+
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
@@ -238,6 +279,9 @@ public:
/// the given LDS memory size is the only constraint.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ bool hasFP16Denormals() const {
+ return FP16Denormals;
+ }
bool hasFP32Denormals() const {
return FP32Denormals;
@@ -259,22 +303,43 @@ public:
return UnalignedBufferAccess;
}
+ bool hasUnalignedScratchAccess() const {
+ return UnalignedScratchAccess;
+ }
+
bool isXNACKEnabled() const {
return EnableXNACK;
}
- unsigned getMaxWavesPerCU() const {
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 10;
+ bool isMesaKernel(const MachineFunction &MF) const {
+ return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ }
- // FIXME: Not sure what this is for other subtagets.
- return 8;
+ // Covers VS/PS/CS graphics shaders
+ bool isMesaGfxShader(const MachineFunction &MF) const {
+ return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ }
+
+ bool isAmdCodeObjectV2(const MachineFunction &MF) const {
+ return isAmdHsaOS() || isMesaKernel(MF);
}
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset() const {
- return isAmdHsaOS() ? 0 : 36;
+ unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
+ return isAmdCodeObjectV2(MF) ? 0 : 36;
+ }
+
+ unsigned getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? 8 : 4;
+ }
+
+ unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
+ if (isMesaKernel(MF))
+ return 16;
+ if (isAmdHsaOS() && isOpenCLEnv())
+ return 32;
+ return 0;
}
unsigned getStackAlignment() const {
@@ -289,6 +354,92 @@ public:
bool enableSubRegLiveness() const override {
return true;
}
+
+ /// \returns Number of execution units per compute unit supported by the
+ /// subtarget.
+ unsigned getEUsPerCU() const {
+ return 4;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+ if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 8;
+ return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+ }
+
+ /// \returns Maximum number of waves per compute unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerCU() const {
+ return getMaxWavesPerEU() * getEUsPerCU();
+ }
+
+ /// \returns Maximum number of waves per compute unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
+ return getWavesPerWorkGroup(FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const {
+ return 1;
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerEU() const {
+ if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 8;
+ // FIXME: Need to take scratch memory into account.
+ return 10;
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given flat work group size.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+ return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
+ getEUsPerCU();
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const {
+ return 1;
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return 2048;
+ }
+
+ /// \returns Number of waves per work group given the flat work group size.
+ unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
+ return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+ }
+
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
+ /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+ /// for function \p F, or minimum/maximum flat work group sizes explicitly
+ /// requested using "amdgpu-flat-work-group-size" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, or violate subtarget's specifications.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+ /// \returns Subtarget's default pair of minimum/maximum number of waves per
+ /// execution unit for function \p F, or minimum/maximum number of waves per
+ /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+ /// attached to function \p F.
+ ///
+ /// \returns Subtarget's default values if explicitly requested values cannot
+ /// be converted to integer, violate subtarget's specifications, or are not
+ /// compatible with minimum/maximum number of waves limited by flat work group
+ /// size, register usage, and/or lds usage.
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
};
class R600Subtarget final : public AMDGPUSubtarget {
@@ -328,14 +479,14 @@ public:
short getTexVTXClauseSize() const {
return TexVTXClauseSize;
}
-
- unsigned getStackEntrySize() const;
};
class SISubtarget final : public AMDGPUSubtarget {
public:
enum {
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
+ // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+ // doesn't spill SGPRs as much as when 80 is set.
+ FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
};
private:
@@ -378,10 +529,6 @@ public:
bool isVGPRSpillingEnabled(const Function& F) const;
- unsigned getAmdKernelCodeChipID() const;
-
- AMDGPU::IsaVersion getIsaVersion() const;
-
unsigned getMaxNumUserSGPRs() const {
return 16;
}
@@ -394,8 +541,24 @@ public:
return HasSMemRealTime;
}
- bool has16BitInsts() const {
- return Has16BitInsts;
+ bool hasMovrel() const {
+ return HasMovrel;
+ }
+
+ bool hasVGPRIndexMode() const {
+ return HasVGPRIndexMode;
+ }
+
+ bool hasScalarCompareEq64() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasScalarStores() const {
+ return HasScalarStores;
+ }
+
+ bool hasInv2PiInlineImm() const {
+ return HasInv2PiInlineImm;
}
bool enableSIScheduler() const {
@@ -426,37 +589,28 @@ public:
bool hasSGPRInitBug() const {
return SGPRInitBug;
}
-};
-
-
-inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const {
- if (getGeneration() >= SOUTHERN_ISLANDS)
- return static_cast<const SISubtarget *>(this)->getInstrInfo();
-
- return static_cast<const R600Subtarget *>(this)->getInstrInfo();
-}
-inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const {
- if (getGeneration() >= SOUTHERN_ISLANDS)
- return static_cast<const SISubtarget *>(this)->getFrameLowering();
+ bool has12DWordStoreHazard() const {
+ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
- return static_cast<const R600Subtarget *>(this)->getFrameLowering();
-}
+  unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplicitArgBytes) const;
-inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const {
- if (getGeneration() >= SOUTHERN_ISLANDS)
- return static_cast<const SISubtarget *>(this)->getTargetLowering();
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
- return static_cast<const R600Subtarget *>(this)->getTargetLowering();
-}
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
-inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const {
- if (getGeneration() >= SOUTHERN_ISLANDS)
- return static_cast<const SISubtarget *>(this)->getRegisterInfo();
+  /// \returns True if a waitcnt instruction is needed before a barrier
+  /// instruction, false otherwise.
+ bool needWaitcntBeforeBarrier() const {
+ return true;
+ }
- return static_cast<const R600Subtarget *>(this)->getRegisterInfo();
-}
+ unsigned getMaxNumSGPRs() const;
+};
-} // End namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b2d4e11..d8a0c71 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,28 +18,32 @@
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
-#include "R600ISelLowering.h"
-#include "R600InstrInfo.h"
+#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
-#include "SIISelLowering.h"
-#include "SIInstrInfo.h"
-
-#include "llvm/Analysis/Passes.h"
+#include "SIMachineScheduler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <memory>
using namespace llvm;
@@ -64,13 +68,20 @@ static cl::opt<bool> EnableR600IfConvert(
static cl::opt<bool> EnableLoadStoreVectorizer(
"amdgpu-load-store-vectorizer",
cl::desc("Enable load store vectorizer"),
+ cl::init(true),
+ cl::Hidden);
+
+// Option to control global load scalarization
+static cl::opt<bool> ScalarizeGlobal(
+ "amdgpu-scalarize-global-loads",
+ cl::desc("Enable global load scalarization"),
cl::init(false),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
- RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
- RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
+ RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
+ RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
PassRegistry *PR = PassRegistry::getPassRegistry();
initializeSILowerI1CopiesPass(*PR);
@@ -83,20 +94,36 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeSIDebuggerInsertNopsPass(*PR);
initializeSIInsertWaitsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
+ initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
+ initializeSIOptimizeExecMaskingPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- return make_unique<AMDGPUTargetObjectFile>();
+ return llvm::make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
+ return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
+}
+
+static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
+ return new SIScheduleDAGMI(C);
+}
+
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C,
+ llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
}
static MachineSchedRegistry
@@ -107,6 +134,11 @@ static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
+static MachineSchedRegistry
+GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+ "Run GCN scheduler to maximize occupancy",
+ createGCNMaxOccupancyMachineScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
@@ -147,13 +179,11 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OptLevel)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
- TLOF(createTLOF(getTargetTriple())),
- IntrinsicInfo() {
- setRequiresStructuredCFG(true);
+ TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
@@ -169,6 +199,10 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
FSAttr.getValueAsString();
}
+void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+ PM.add(createAMDGPUUnifyMetadataPass());
+}
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
@@ -178,7 +212,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+ setRequiresStructuredCFG(true);
+}
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
const Function &F) const {
@@ -206,13 +242,15 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
+
struct SIGISelActualAccessor : public GISelAccessor {
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
const AMDGPUCallLowering *getCallLowering() const override {
return CallLoweringInfo.get();
}
};
-} // End anonymous namespace.
+
+} // end anonymous namespace
#endif
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
@@ -248,6 +286,8 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
I->setGISelAccessor(*GISel);
}
+ I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
return I.get();
}
@@ -261,7 +301,6 @@ class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
-
// Exceptions and StackMaps are not supported, so these passes will never do
// anything.
disablePass(&StackMapLivenessID);
@@ -272,6 +311,14 @@ public:
return getTM<AMDGPUTargetMachine>();
}
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+ }
+
void addEarlyCSEOrGVNPass();
void addStraightLineScalarOptimizationPasses();
void addIRPasses() override;
@@ -284,7 +331,7 @@ public:
class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
- : AMDGPUPassConfig(TM, PM) { }
+ : AMDGPUPassConfig(TM, PM) {}
ScheduleDAGInstrs *createMachineScheduler(
MachineSchedContext *C) const override {
@@ -300,7 +347,7 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
- : AMDGPUPassConfig(TM, PM) { }
+ : AMDGPUPassConfig(TM, PM) {}
GCNTargetMachine &getGCNTargetMachine() const {
return getTM<GCNTargetMachine>();
@@ -315,16 +362,19 @@ public:
bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
-} // End of anonymous namespace
+} // end anonymous namespace
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
@@ -363,7 +413,7 @@ void AMDGPUPassConfig::addIRPasses() {
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
- addPass(createAlwaysInlinerPass());
+ addPass(createAlwaysInlinerLegacyPass());
// We need to add the barrier noop pass, otherwise adding the function
// inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
@@ -380,9 +430,9 @@ void AMDGPUPassConfig::addIRPasses() {
if (EnableSROA)
addPass(createSROAPass());
- }
- addStraightLineScalarOptimizationPasses();
+ addStraightLineScalarOptimizationPasses();
+ }
TargetPassConfig::addIRPasses();
@@ -415,7 +465,7 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+ addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
return false;
}
@@ -468,7 +518,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
- return nullptr;
+ return createGCNMaxOccupancyMachineScheduler(C);
}
bool GCNPassConfig::addPreISel() {
@@ -498,6 +548,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
+ addPass(&SILoadStoreOptimizerID);
}
void GCNPassConfig::addIRPasses() {
@@ -520,43 +571,54 @@ bool GCNPassConfig::addIRTranslator() {
return false;
}
+bool GCNPassConfig::addLegalizeMachineIR() {
+ return false;
+}
+
bool GCNPassConfig::addRegBankSelect() {
return false;
}
+
+bool GCNPassConfig::addGlobalInstructionSelect() {
+ return false;
+}
#endif
void GCNPassConfig::addPreRegAlloc() {
- // This needs to be run directly before register allocation because
- // earlier passes might recompute live intervals.
- // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
- if (getOptLevel() > CodeGenOpt::None) {
- insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
- }
-
- if (getOptLevel() > CodeGenOpt::None) {
- // Don't do this with no optimizations since it throws away debug info by
- // merging nonadjacent loads.
-
- // This should be run after scheduling, but before register allocation. It
- // also need extra copies to the address operand to be eliminated.
-
- // FIXME: Move pre-RA and remove extra reg coalescer run.
- insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
- insertPass(&MachineSchedulerID, &RegisterCoalescerID);
- }
-
addPass(createSIShrinkInstructionsPass());
addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ // FIXME: We have to disable the verifier here because of PHIElimination +
+ // TwoAddressInstructions disabling it.
+
+ // This must be run immediately after phi elimination and before
+ // TwoAddressInstructions, otherwise the processing of the tied operand of
+ // SI_ELSE will introduce a copy of the tied operand source after the else.
+ insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ // This needs to be run directly before register allocation because earlier
+ // passes might recompute live intervals.
+ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
+ // This must be run immediately after phi elimination and before
+ // TwoAddressInstructions, otherwise the processing of the tied operand of
+ // SI_ELSE will introduce a copy of the tied operand source after the else.
+ insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
+void GCNPassConfig::addPostRegAlloc() {
+ addPass(&SIOptimizeExecMaskingID);
+ TargetPassConfig::addPostRegAlloc();
+}
+
void GCNPassConfig::addPreSched2() {
}
@@ -573,8 +635,9 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
- addPass(createSILowerControlFlowPass());
+ addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
+ addPass(&BranchRelaxationPassID);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index b0eb3a9..9496773 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -17,6 +17,13 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
@@ -37,10 +44,10 @@ public:
StringRef FS, TargetOptions Options,
Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- ~AMDGPUTargetMachine();
+ ~AMDGPUTargetMachine() override;
const AMDGPUSubtarget *getSubtargetImpl() const;
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override;
+ const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0;
const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
@@ -50,6 +57,7 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+ void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
};
//===----------------------------------------------------------------------===//
@@ -90,13 +98,6 @@ public:
const SISubtarget *getSubtargetImpl(const Function &) const override;
};
-inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
- const Function &F) const {
- if (getTargetTriple().getArch() == Triple::amdgcn)
- return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F);
- return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F);
-}
+} // end namespace llvm
-} // End namespace llvm
-
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 03d1e2c..1fddc88 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -9,10 +9,10 @@
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
+#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
@@ -20,12 +20,11 @@ using namespace llvm;
// Generic Object File
//===----------------------------------------------------------------------===//
-MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
- SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const {
- if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV))
+MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+ AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
return TextSection;
- return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index f530e09..de32778 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -23,8 +23,7 @@ namespace llvm {
class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
public:
- MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
const TargetMachine &TM) const override;
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3d630fe..e904870 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,7 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
return Vector ? 0 : 32;
}
-unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) {
+unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
switch (AddrSpace) {
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
@@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
@@ -241,6 +241,7 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::amdgcn_interp_mov:
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_mbcnt_hi:
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a82a074..0d83b2a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -64,13 +64,6 @@ public:
ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
- // Provide value semantics. MSVC requires that we spell all of these out.
- AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
- AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
- TLI(std::move(Arg.TLI)) {}
-
bool hasBranchDivergence() { return true; }
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
@@ -82,7 +75,7 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
- unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace);
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
@@ -90,7 +83,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getCFInstrCost(unsigned Opcode);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
new file mode 100644
index 0000000..bf501a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -0,0 +1,149 @@
+//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This pass unifies multiple OpenCL metadata due to linking.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+ namespace kOCLMD {
+ const char SpirVer[] = "opencl.spir.version";
+ const char OCLVer[] = "opencl.ocl.version";
+ const char UsedExt[] = "opencl.used.extensions";
+ const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
+ const char CompilerOptions[] = "opencl.compiler.options";
+ const char LLVMIdent[] = "llvm.ident";
+ }
+
+ /// \brief Unify multiple OpenCL metadata due to linking.
+ class AMDGPUUnifyMetadata : public FunctionPass {
+ public:
+ static char ID;
+ explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {};
+
+ private:
+ // This should really be a module pass, but we have to run it as early as
+ // possible. Since function passes are executed first and
+ // TargetMachine::addEarlyAsPossiblePasses() expects only function passes,
+ // it has to be a function pass.
+ virtual bool runOnModule(Module &M);
+
+ // \todo: Convert to a module pass.
+ virtual bool runOnFunction(Function &F);
+
+ /// \brief Unify version metadata.
+ /// \return true if changes are made.
+ /// Assume the named metadata has operands each of which is a pair of
+ /// integer constants, e.g.
+ /// !Name = {!n1, !n2}
+ /// !n1 = {i32 1, i32 2}
+ /// !n2 = {i32 2, i32 0}
+ /// Keep the largest version as the sole operand if PickFirst is false.
+ /// Otherwise pick it from the first value, representing the kernel module.
+ bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
+ auto NamedMD = M.getNamedMetadata(Name);
+ if (!NamedMD || NamedMD->getNumOperands() <= 1)
+ return false;
+ MDNode *MaxMD = nullptr;
+ auto MaxVer = 0U;
+ for (const auto &VersionMD : NamedMD->operands()) {
+ assert(VersionMD->getNumOperands() == 2);
+ auto CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
+ auto VersionMajor = CMajor->getZExtValue();
+ auto CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
+ auto VersionMinor = CMinor->getZExtValue();
+ auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
+ if (Ver > MaxVer) {
+ MaxVer = Ver;
+ MaxMD = VersionMD;
+ }
+ if (PickFirst)
+ break;
+ }
+ NamedMD->eraseFromParent();
+ NamedMD = M.getOrInsertNamedMetadata(Name);
+ NamedMD->addOperand(MaxMD);
+ return true;
+ }
+
+ /// \brief Unify extension metadata.
+ /// \return true if changes are made.
+ /// Assume the named metadata has operands each of which is a list e.g.
+ /// !Name = {!n1, !n2}
+ /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
+ /// !n2 = !{!"cl_khr_image"}
+ /// Combine it into a single list with unique operands.
+ bool unifyExtensionMD(Module &M, StringRef Name) {
+ auto NamedMD = M.getNamedMetadata(Name);
+ if (!NamedMD || NamedMD->getNumOperands() == 1)
+ return false;
+
+ SmallVector<Metadata *, 4> All;
+ for (const auto &MD : NamedMD->operands())
+ for (const auto &Op : MD->operands())
+ if (std::find(All.begin(), All.end(), Op.get()) == All.end())
+ All.push_back(Op.get());
+
+ NamedMD->eraseFromParent();
+ NamedMD = M.getOrInsertNamedMetadata(Name);
+ for (const auto &MD : All)
+ NamedMD->addOperand(MDNode::get(M.getContext(), MD));
+
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+char AMDGPUUnifyMetadata::ID = 0;
+
+char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
+
+INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
+ "Unify multiple OpenCL metadata due to linking",
+ false, false)
+
+FunctionPass* llvm::createAMDGPUUnifyMetadataPass() {
+ return new AMDGPUUnifyMetadata();
+}
+
+bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
+ const char* Vers[] = {
+ kOCLMD::SpirVer,
+ kOCLMD::OCLVer
+ };
+ const char* Exts[] = {
+ kOCLMD::UsedExt,
+ kOCLMD::UsedOptCoreFeat,
+ kOCLMD::CompilerOptions,
+ kOCLMD::LLVMIdent
+ };
+
+ bool Changed = false;
+
+ for (auto &I : Vers)
+ Changed |= unifyVersionMD(M, I, true);
+
+ for (auto &I : Exts)
+ Changed |= unifyExtensionMD(M, I);
+
+ return Changed;
+}
+
+bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
+ return runOnModule(*F.getParent());
+}
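For illustration only, a minimal standalone sketch of the selection rule unifyVersionMD() applies, using plain integer pairs instead of MDNode operands (the pickUnifiedVersion name is hypothetical): each version is scored as Major * 100 + Minor * 10 and the highest score wins, unless PickFirst keeps the first (kernel module) entry.

#include <cstdint>
#include <utility>
#include <vector>

static std::pair<uint32_t, uint32_t>
pickUnifiedVersion(const std::vector<std::pair<uint32_t, uint32_t>> &Versions,
                   bool PickFirst) {
  std::pair<uint32_t, uint32_t> Max{0, 0};
  uint32_t MaxVer = 0;
  for (const auto &V : Versions) {
    uint32_t Ver = V.first * 100 + V.second * 10; // same scoring as the pass
    if (Ver > MaxVer) {
      MaxVer = Ver;
      Max = V;
    }
    if (PickFirst)
      break;
  }
  return Max;
}
// pickUnifiedVersion({{1, 2}, {2, 0}}, false) -> {2, 0}  (highest version kept)
// pickUnifiedVersion({{1, 2}, {2, 0}}, true)  -> {1, 2}  (first module wins)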
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 21de763..7faeccd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -139,16 +138,15 @@ public:
initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "AMDGPU Control Flow Graph structurizer Pass";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<MachineFunctionAnalysis>();
- AU.addRequired<MachineFunctionAnalysis>();
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
/// Perform the CFG structurization
@@ -220,7 +218,8 @@ protected:
bool needMigrateBlock(MachineBasicBlock *MBB) const;
// Utility Functions
- void reversePredicateSetter(MachineBasicBlock::iterator I);
+ void reversePredicateSetter(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB);
/// Compute the reversed DFS post order of Blocks
void orderBlocks(MachineFunction *MF);
@@ -422,26 +421,24 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
}
void AMDGPUCFGStructurizer::reversePredicateSetter(
- MachineBasicBlock::iterator I) {
- assert(static_cast<MachineInstr *>(I) && "Expected valid iterator");
+ MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
+ assert(I.isValid() && "Expected valid iterator");
for (;; --I) {
+ if (I == MBB.end())
+ continue;
if (I->getOpcode() == AMDGPU::PRED_X) {
- switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
- case OPCODE_IS_ZERO_INT:
- static_cast<MachineInstr *>(I)->getOperand(2)
- .setImm(OPCODE_IS_NOT_ZERO_INT);
+ switch (I->getOperand(2).getImm()) {
+ case AMDGPU::PRED_SETE_INT:
+ I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
return;
- case OPCODE_IS_NOT_ZERO_INT:
- static_cast<MachineInstr *>(I)->getOperand(2)
- .setImm(OPCODE_IS_ZERO_INT);
+ case AMDGPU::PRED_SETNE_INT:
+ I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
return;
- case OPCODE_IS_ZERO:
- static_cast<MachineInstr *>(I)->getOperand(2)
- .setImm(OPCODE_IS_NOT_ZERO);
+ case AMDGPU::PRED_SETE:
+ I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
return;
- case OPCODE_IS_NOT_ZERO:
- static_cast<MachineInstr *>(I)->getOperand(2)
- .setImm(OPCODE_IS_ZERO);
+ case AMDGPU::PRED_SETNE:
+ I->getOperand(2).setImm(AMDGPU::PRED_SETE);
return;
default:
llvm_unreachable("PRED_X Opcode invalid!");
@@ -841,7 +838,7 @@ bool AMDGPUCFGStructurizer::run() {
} //while, "one iteration" over the function.
MachineBasicBlock *EntryMBB =
- &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+ *GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
DEBUG(
@@ -864,7 +861,7 @@ bool AMDGPUCFGStructurizer::run() {
} while (!Finish && MakeProgress);
// Misc wrap up to maintain the consistency of the Function representation.
- wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+ wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
// Detach retired Block, release memory.
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
@@ -908,9 +905,9 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
// walk through all the blocks in the function to check for unreachable ones
typedef GraphTraits<MachineFunction *> GTM;
- MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+ auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
for (; It != E; ++It) {
- MachineBasicBlock *MBB = &(*It);
+ MachineBasicBlock *MBB = *It;
SccNum = getSCCNum(MBB);
if (SccNum == INVALIDSCCNUM)
dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
@@ -995,7 +992,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
// Triangle pattern, true is empty
// We reverse the predicate to make a triangle, empty false pattern;
std::swap(TrueMBB, FalseMBB);
- reversePredicateSetter(MBB->end());
+ reversePredicateSetter(MBB->end(), *MBB);
LandBlk = FalseMBB;
FalseMBB = nullptr;
} else if (FalseMBB->succ_size() == 1
@@ -1505,7 +1502,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
- reversePredicateSetter(I);
+ reversePredicateSetter(I, *I->getParent());
insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
insertInstrBefore(I, AMDGPU::BREAK);
insertInstrBefore(I, AMDGPU::ENDIF);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index efcf1b2..3cf9a1d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -15,38 +15,62 @@
#include "Utils/AMDKernelCodeTUtils.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
-struct OptionalOperand;
+class AMDGPUAsmParser;
enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
class AMDGPUOperand : public MCParsedAsmOperand {
enum KindTy {
Token,
@@ -56,16 +80,18 @@ class AMDGPUOperand : public MCParsedAsmOperand {
} Kind;
SMLoc StartLoc, EndLoc;
+ const AMDGPUAsmParser *AsmParser;
public:
- AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+ : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
typedef std::unique_ptr<AMDGPUOperand> Ptr;
struct Modifiers {
- bool Abs;
- bool Neg;
- bool Sext;
+ bool Abs = false;
+ bool Neg = false;
+ bool Sext = false;
bool hasFPModifiers() const { return Abs || Neg; }
bool hasIntModifiers() const { return Sext; }
@@ -126,8 +152,15 @@ public:
ImmTyDA,
ImmTyR128,
ImmTyLWE,
+ ImmTyExpTgt,
+ ImmTyExpCompr,
+ ImmTyExpVM,
ImmTyHwreg,
+ ImmTyOff,
ImmTySendMsg,
+ ImmTyInterpSlot,
+ ImmTyInterpAttr,
+ ImmTyAttrChan
};
struct TokOp {
@@ -136,18 +169,16 @@ public:
};
struct ImmOp {
- bool IsFPImm;
- ImmTy Type;
int64_t Val;
+ ImmTy Type;
+ bool IsFPImm;
Modifiers Mods;
};
struct RegOp {
unsigned RegNo;
- Modifiers Mods;
- const MCRegisterInfo *TRI;
- const MCSubtargetInfo *STI;
bool IsForcedVOP3;
+ Modifiers Mods;
};
union {
@@ -175,41 +206,66 @@ public:
return Kind == Immediate;
}
- bool isInlinableImm() const {
- if (!isImmTy(ImmTyNone)) {
- // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
- return false;
- }
- // TODO: We should avoid using host float here. It would be better to
- // check the float bit values which is what a few other places do.
- // We've had bot failures before due to weird NaN support on mips hosts.
- const float F = BitsToFloat(Imm.Val);
- // TODO: Add 1/(2*pi) for VI
- return (Imm.Val <= 64 && Imm.Val >= -16) ||
- (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 ||
- F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0);
- }
+ bool isInlinableImm(MVT type) const;
+ bool isLiteralImm(MVT type) const;
bool isRegKind() const {
return Kind == Register;
}
bool isReg() const override {
- return isRegKind() && !Reg.Mods.hasModifiers();
+ return isRegKind() && !hasModifiers();
+ }
+
+ bool isRegOrImmWithInputMods(MVT type) const {
+ return isRegKind() || isInlinableImm(type);
+ }
+
+ bool isRegOrImmWithInt16InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i16);
+ }
+
+ bool isRegOrImmWithInt32InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i32);
+ }
+
+ bool isRegOrImmWithInt64InputMods() const {
+ return isRegOrImmWithInputMods(MVT::i64);
+ }
+
+ bool isRegOrImmWithFP16InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f16);
}
- bool isRegOrImmWithInputMods() const {
- return isRegKind() || isInlinableImm();
+ bool isRegOrImmWithFP32InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f32);
+ }
+
+ bool isRegOrImmWithFP64InputMods() const {
+ return isRegOrImmWithInputMods(MVT::f64);
+ }
+
+ bool isVReg() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::VReg_64RegClassID) ||
+ isRegClass(AMDGPU::VReg_96RegClassID) ||
+ isRegClass(AMDGPU::VReg_128RegClassID) ||
+ isRegClass(AMDGPU::VReg_256RegClassID) ||
+ isRegClass(AMDGPU::VReg_512RegClassID);
+ }
+
+ bool isVReg32OrOff() const {
+ return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
}
-
+
bool isImmModifier() const {
return isImm() && Imm.Type != ImmTyNone;
}
-
+
bool isClampSI() const { return isImmTy(ImmTyClampSI); }
bool isOModSI() const { return isImmTy(ImmTyOModSI); }
bool isDMask() const { return isImmTy(ImmTyDMask); }
@@ -217,6 +273,10 @@ public:
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128() const { return isImmTy(ImmTyUNorm); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
+ bool isOff() const { return isImmTy(ImmTyOff); }
+ bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
+ bool isExpVM() const { return isImmTy(ImmTyExpVM); }
+ bool isExpCompr() const { return isImmTy(ImmTyExpCompr); }
bool isOffen() const { return isImmTy(ImmTyOffen); }
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
@@ -234,7 +294,10 @@ public:
bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
-
+ bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
+ bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
+ bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+
bool isMod() const {
return isClampSI() || isOModSI();
}
@@ -243,47 +306,116 @@ public:
return isReg() || isImm();
}
- bool isRegClass(unsigned RCID) const {
- return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg());
+ bool isRegClass(unsigned RCID) const;
+
+ bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
+ return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
}
- bool isSCSrc32() const {
- return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID);
+ bool isSCSrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
}
- bool isSCSrc64() const {
- return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID);
+ bool isSCSrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
}
- bool isSSrc32() const {
- return isImm() || isSCSrc32() || isExpr();
+ bool isSCSrcB64() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64);
}
- bool isSSrc64() const {
+ bool isSCSrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
+ }
+
+ bool isSCSrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
+ }
+
+ bool isSCSrcF64() const {
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64);
+ }
+
+ bool isSSrcB32() const {
+ return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
+ }
+
+ bool isSSrcB16() const {
+ return isSCSrcB16() || isLiteralImm(MVT::i16);
+ }
+
+ bool isSSrcB64() const {
// TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
// See isVSrc64().
- return isImm() || isSCSrc64();
+ return isSCSrcB64() || isLiteralImm(MVT::i64);
+ }
+
+ bool isSSrcF32() const {
+ return isSCSrcB32() || isLiteralImm(MVT::f32) || isExpr();
+ }
+
+ bool isSSrcF64() const {
+ return isSCSrcB64() || isLiteralImm(MVT::f64);
+ }
+
+ bool isSSrcF16() const {
+ return isSCSrcB16() || isLiteralImm(MVT::f16);
+ }
+
+ bool isVCSrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
+ }
+
+ bool isVCSrcB64() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
+ }
+
+ bool isVCSrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
+ }
+
+ bool isVCSrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
+ }
+
+ bool isVCSrcF64() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
+ }
+
+ bool isVCSrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
+ }
+
+ bool isVSrcB32() const {
+ return isVCSrcF32() || isLiteralImm(MVT::i32);
+ }
+
+ bool isVSrcB64() const {
+ return isVCSrcF64() || isLiteralImm(MVT::i64);
}
- bool isVCSrc32() const {
- return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID);
+ bool isVSrcB16() const {
+ return isVCSrcF16() || isLiteralImm(MVT::i16);
}
- bool isVCSrc64() const {
- return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID);
+ bool isVSrcF32() const {
+ return isVCSrcF32() || isLiteralImm(MVT::f32);
}
- bool isVSrc32() const {
- return isImm() || isVCSrc32();
+ bool isVSrcF64() const {
+ return isVCSrcF64() || isLiteralImm(MVT::f64);
}
- bool isVSrc64() const {
- // TODO: Check if the 64-bit value (coming from assembly source) can be
- // narrowed to 32 bits (in the instruction stream). That require knowledge
- // of instruction type (unsigned/signed, floating or "untyped"/B64),
- // see [AMD GCN3 ISA 6.3.1].
- // TODO: How 64-bit values are formed from 32-bit literals in _B64 insns?
- return isImm() || isVCSrc64();
+ bool isVSrcF16() const {
+ return isVCSrcF16() || isLiteralImm(MVT::f16);
+ }
+
+ bool isKImmFP32() const {
+ return isLiteralImm(MVT::f32);
+ }
+
+ bool isKImmFP16() const {
+ return isLiteralImm(MVT::f16);
}
bool isMem() const override {
@@ -301,9 +433,11 @@ public:
bool isSWaitCnt() const;
bool isHwreg() const;
bool isSendMsg() const;
- bool isSMRDOffset() const;
+ bool isSMRDOffset8() const;
+ bool isSMRDOffset20() const;
bool isSMRDLiteralOffset() const;
bool isDPPCtrl() const;
+ bool isGPRIdxMode() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -311,7 +445,6 @@ public:
return S->getSymbol().getName();
}
-
StringRef getToken() const {
assert(isToken());
@@ -359,7 +492,7 @@ public:
bool hasModifiers() const {
return getModifiers().hasModifiers();
}
-
+
bool hasFPModifiers() const {
return getModifiers().hasFPModifiers();
}
@@ -368,30 +501,23 @@ public:
return getModifiers().hasIntModifiers();
}
- void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const {
- if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) {
- // Apply modifiers to immediate value
- int64_t Val = Imm.Val;
- bool Negate = Imm.Mods.Neg; // Only negate can get here
- if (Imm.IsFPImm) {
- APFloat F(BitsToFloat(Val));
- if (Negate) {
- F.changeSign();
- }
- Val = F.bitcastToAPInt().getZExtValue();
- } else {
- Val = Negate ? -Val : Val;
- }
- Inst.addOperand(MCOperand::createImm(Val));
- } else {
- Inst.addOperand(MCOperand::createImm(getImm()));
- }
+ void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
+
+ void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+
+ template <unsigned Bitwidth>
+ void addKImmFPOperands(MCInst &Inst, unsigned N) const;
+
+ void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
+ addKImmFPOperands<16>(Inst, N);
}
- void addRegOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
+ void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
+ addKImmFPOperands<32>(Inst, N);
}
+ void addRegOperands(MCInst &Inst, unsigned N) const;
+
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
if (isRegKind())
addRegOperands(Inst, N);
@@ -421,6 +547,23 @@ public:
addRegOrImmWithInputModsOperands(Inst, N);
}
+ void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const {
+ Modifiers Mods = getModifiers();
+ Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+ assert(isRegKind());
+ addRegOperands(Inst, N);
+ }
+
+ void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
+ void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
if (isImm())
addImmOperands(Inst, N);
@@ -430,7 +573,7 @@ public:
}
}
- void printImmTy(raw_ostream& OS, ImmTy Type) const {
+ static void printImmTy(raw_ostream& OS, ImmTy Type) {
switch (Type) {
case ImmTyNone: OS << "None"; break;
case ImmTyGDS: OS << "GDS"; break;
@@ -458,8 +601,15 @@ public:
case ImmTyDA: OS << "DA"; break;
case ImmTyR128: OS << "R128"; break;
case ImmTyLWE: OS << "LWE"; break;
+ case ImmTyOff: OS << "Off"; break;
+ case ImmTyExpTgt: OS << "ExpTgt"; break;
+ case ImmTyExpCompr: OS << "ExpCompr"; break;
+ case ImmTyExpVM: OS << "ExpVM"; break;
case ImmTyHwreg: OS << "Hwreg"; break;
case ImmTySendMsg: OS << "SendMsg"; break;
+ case ImmTyInterpSlot: OS << "InterpSlot"; break;
+ case ImmTyInterpAttr: OS << "InterpAttr"; break;
+ case ImmTyAttrChan: OS << "AttrChan"; break;
}
}
@@ -484,22 +634,24 @@ public:
}
}
- static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc,
+ static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
+ int64_t Val, SMLoc Loc,
enum ImmTy Type = ImmTyNone,
bool IsFPImm = false) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
+ auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
Op->Imm.Type = Type;
- Op->Imm.Mods = {false, false, false};
+ Op->Imm.Mods = Modifiers();
Op->StartLoc = Loc;
Op->EndLoc = Loc;
return Op;
}
- static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc,
+ static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser,
+ StringRef Str, SMLoc Loc,
bool HasExplicitEncodingSize = true) {
- auto Res = llvm::make_unique<AMDGPUOperand>(Token);
+ auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
Res->StartLoc = Loc;
@@ -507,24 +659,22 @@ public:
return Res;
}
- static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S,
+ static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
+ unsigned RegNo, SMLoc S,
SMLoc E,
- const MCRegisterInfo *TRI,
- const MCSubtargetInfo *STI,
bool ForceVOP3) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Register);
+ auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
Op->Reg.RegNo = RegNo;
- Op->Reg.TRI = TRI;
- Op->Reg.STI = STI;
- Op->Reg.Mods = {false, false, false};
+ Op->Reg.Mods = Modifiers();
Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Expression);
+ static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser,
+ const class MCExpr *Expr, SMLoc S) {
+ auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser);
Op->Expr = Expr;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -537,6 +687,53 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
return OS;
}
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
+// Holds info related to the current kernel, e.g. count of SGPRs used.
+// Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next
+// .amdgpu_hsa_kernel or at EOF.
+class KernelScopeInfo {
+ int SgprIndexUnusedMin;
+ int VgprIndexUnusedMin;
+ MCContext *Ctx;
+
+ void usesSgprAt(int i) {
+ if (i >= SgprIndexUnusedMin) {
+ SgprIndexUnusedMin = ++i;
+ if (Ctx) {
+ MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count"));
+ Sym->setVariableValue(MCConstantExpr::create(SgprIndexUnusedMin, *Ctx));
+ }
+ }
+ }
+ void usesVgprAt(int i) {
+ if (i >= VgprIndexUnusedMin) {
+ VgprIndexUnusedMin = ++i;
+ if (Ctx) {
+ MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+ Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+ }
+ }
+ }
+public:
+ KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr)
+ {}
+ void initialize(MCContext &Context) {
+ Ctx = &Context;
+ usesSgprAt(SgprIndexUnusedMin = -1);
+ usesVgprAt(VgprIndexUnusedMin = -1);
+ }
+ void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
+ switch (RegKind) {
+ case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
+ case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
+ default: break;
+ }
+ }
+};
+
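A brief worked example of the bookkeeping above (the register range is assumed, not taken from the patch): parsing s[4:7] reports DwordRegIndex = 4 and RegWidth = 4, so the counters evolve as follows.

// usesRegister(IS_SGPR, /*DwordRegIndex=*/4, /*RegWidth=*/4)
//   -> usesSgprAt(4 + 4 - 1) == usesSgprAt(7)
//   -> SgprIndexUnusedMin = 8
//   -> .kernel.sgpr_count symbol set to 8 (one past the highest SGPR dword used)
// .kernel.vgpr_count is maintained the same way for VGPRs.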
class AMDGPUAsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
MCAsmParser &Parser;
@@ -544,22 +741,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
unsigned ForcedEncodingSize;
bool ForcedDPP;
bool ForcedSDWA;
-
- bool isSI() const {
- return AMDGPU::isSI(getSTI());
- }
-
- bool isCI() const {
- return AMDGPU::isCI(getSTI());
- }
-
- bool isVI() const {
- return AMDGPU::isVI(getSTI());
- }
-
- bool hasSGPR102_SGPR103() const {
- return !isVI();
- }
+ KernelScopeInfo KernelScope;
/// @name Auto-generated Match Functions
/// {
@@ -570,9 +752,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
/// }
private:
+ bool ParseAsAbsoluteExpression(uint32_t &Ret);
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
+ bool ParseDirectiveRuntimeMetadata();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
bool ParseSectionDirectiveHSAText();
@@ -584,7 +768,7 @@ private:
bool ParseSectionDirectiveHSADataGlobalProgram();
bool ParseSectionDirectiveHSARodataReadonlyAgent();
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
- bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
public:
@@ -622,6 +806,27 @@ public:
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
}
+ KernelScope.initialize(getContext());
+ }
+
+ bool isSI() const {
+ return AMDGPU::isSI(getSTI());
+ }
+
+ bool isCI() const {
+ return AMDGPU::isCI(getSTI());
+ }
+
+ bool isVI() const {
+ return AMDGPU::isVI(getSTI());
+ }
+
+ bool hasInv2PiInlineImm() const {
+ return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+ }
+
+ bool hasSGPR102_SGPR103() const {
+ return !isVI();
}
AMDGPUTargetStreamer &getTargetStreamer() {
@@ -629,6 +834,16 @@ public:
return static_cast<AMDGPUTargetStreamer &>(TS);
}
+ const MCRegisterInfo *getMRI() const {
+ // We need this const_cast because for some reason getContext() is not const
+ // in MCAsmParser.
+ return const_cast<AMDGPUAsmParser*>(this)->getContext().getRegisterInfo();
+ }
+
+ const MCInstrInfo *getMII() const {
+ return &MII;
+ }
+
void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
@@ -637,6 +852,7 @@ public:
bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
bool isForcedDPP() const { return ForcedDPP; }
bool isForcedSDWA() const { return ForcedSDWA; }
+ ArrayRef<unsigned> getMatchedVariants() const;
std::unique_ptr<AMDGPUOperand> parseRegister();
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
@@ -652,23 +868,31 @@ public:
StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
+ //bool ProcessInstruction(MCInst &Inst);
OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
- OperandMatchResultTy parseIntWithPrefix(const char *Prefix,
- OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
- bool (*ConvertResult)(int64_t&) = 0);
- OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
- OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value);
+ OperandMatchResultTy
+ parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t &) = nullptr);
+ OperandMatchResultTy
+ parseNamedBit(const char *Name, OperandVector &Operands,
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
+ StringRef &Value);
OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseReg(OperandVector &Operands);
OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands);
+ void cvtExp(MCInst &Inst, const OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
@@ -683,10 +907,17 @@ private:
bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+
+ void errorExpTgt();
+ OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
+ OperandMatchResultTy parseExpTgt(OperandVector &Operands);
OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
+ OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
+ OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
@@ -701,7 +932,8 @@ public:
AMDGPUOperand::Ptr defaultDA() const;
AMDGPUOperand::Ptr defaultR128() const;
AMDGPUOperand::Ptr defaultLWE() const;
- AMDGPUOperand::Ptr defaultSMRDOffset() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset8() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -736,8 +968,274 @@ struct OptionalOperand {
bool (*ConvertResult)(int64_t&);
};
+} // end anonymous namespace
+
+// May be called with an integer type of equivalent bitwidth.
+static const fltSemantics *getFltSemantics(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return &APFloat::IEEEsingle();
+ case 8:
+ return &APFloat::IEEEdouble();
+ case 2:
+ return &APFloat::IEEEhalf();
+ default:
+ llvm_unreachable("unsupported fp type");
+ }
+}
+
+static const fltSemantics *getFltSemantics(MVT VT) {
+ return getFltSemantics(VT.getSizeInBits() / 8);
+}
+
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
+static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
+ bool Lost;
+
+ // Convert literal to the target floating-point type
+ APFloat::opStatus Status = FPLiteral.convert(*getFltSemantics(VT),
+ APFloat::rmNearestTiesToEven,
+ &Lost);
+ // We allow precision loss but not overflow or underflow
+ if (Status != APFloat::opOK &&
+ Lost &&
+ ((Status & APFloat::opOverflow) != 0 ||
+ (Status & APFloat::opUnderflow) != 0)) {
+ return false;
+ }
+
+ return true;
+}
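A behavioural sketch of the helper above (illustrative only; sketchLosslessCheck is a hypothetical stand-in, not part of the patch): precision loss alone is tolerated, while overflow or underflow of the target type is rejected.

static bool sketchLosslessCheck(double D, const fltSemantics &Sem) {
  bool Lost;
  APFloat F(D);
  APFloat::opStatus S = F.convert(Sem, APFloat::rmNearestTiesToEven, &Lost);
  // Mirror canLosslesslyConvertToFPType(): reject only overflow/underflow.
  return !(S != APFloat::opOK && Lost &&
           ((S & APFloat::opOverflow) != 0 || (S & APFloat::opUnderflow) != 0));
}
// sketchLosslessCheck(0.1, APFloat::IEEEhalf())    -> true  (inexact only)
// sketchLosslessCheck(1.0e30, APFloat::IEEEhalf()) -> false (overflows half)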
+
+bool AMDGPUOperand::isInlinableImm(MVT type) const {
+ if (!isImmTy(ImmTyNone)) {
+ // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
+ return false;
+ }
+ // TODO: We should avoid using host float here. It would be better to
+ // check the float bit values which is what a few other places do.
+ // We've had bot failures before due to weird NaN support on mips hosts.
+
+ APInt Literal(64, Imm.Val);
+
+ if (Imm.IsFPImm) { // We got fp literal token
+ if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
+ return AMDGPU::isInlinableLiteral64(Imm.Val,
+ AsmParser->hasInv2PiInlineImm());
+ }
+
+ APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+ if (!canLosslesslyConvertToFPType(FPLiteral, type))
+ return false;
+
+ // Check if single precision literal is inlinable
+ return AMDGPU::isInlinableLiteral32(
+ static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+ AsmParser->hasInv2PiInlineImm());
+ }
+
+
+ // We got int literal token.
+ if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
+ return AMDGPU::isInlinableLiteral64(Imm.Val,
+ AsmParser->hasInv2PiInlineImm());
+ }
+
+ if (type.getScalarSizeInBits() == 16) {
+ return AMDGPU::isInlinableLiteral16(
+ static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
+ AsmParser->hasInv2PiInlineImm());
+ }
+
+ return AMDGPU::isInlinableLiteral32(
+ static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
+ AsmParser->hasInv2PiInlineImm());
+}
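For reference, the isInlinableImm() body removed earlier in this file's diff spelled out the 32-bit rule that AMDGPU::isInlinableLiteral32 implements; a minimal sketch of that rule follows (the name is hypothetical, and the real helper additionally accepts 1/(2*pi) when hasInv2PiInlineImm() is true).

static bool sketchIsInlinable32(int32_t Bits) {
  const float F = BitsToFloat(Bits); // host-float check, as the old code did
  return (Bits >= -16 && Bits <= 64) ||
         F == 0.0f || F == 0.5f || F == -0.5f || F == 1.0f || F == -1.0f ||
         F == 2.0f || F == -2.0f || F == 4.0f || F == -4.0f;
}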
+
+bool AMDGPUOperand::isLiteralImm(MVT type) const {
+ // Check that this imediate can be added as literal
+ if (!isImmTy(ImmTyNone)) {
+ return false;
+ }
+
+ if (!Imm.IsFPImm) {
+ // We got int literal token.
+
+ unsigned Size = type.getSizeInBits();
+ if (Size == 64)
+ Size = 32;
+
+ // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
+ // types.
+ return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
+ }
+
+ // We got fp literal token
+ if (type == MVT::f64) { // Expected 64-bit fp operand
+ // We would set the low 32 bits of the literal to zero, but we accept such literals
+ return true;
+ }
+
+ if (type == MVT::i64) { // Expected 64-bit int operand
+ // We don't allow fp literals in 64-bit integer instructions. It is
+ // unclear how we should encode them.
+ return false;
+ }
+
+ APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+ return canLosslesslyConvertToFPType(FPLiteral, type);
+}
+
+bool AMDGPUOperand::isRegClass(unsigned RCID) const {
+ return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+ int64_t Val = Imm.Val;
+ if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) {
+ // Apply modifiers to immediate value. Only negate can get here
+ if (Imm.IsFPImm) {
+ APFloat F(BitsToDouble(Val));
+ F.changeSign();
+ Val = F.bitcastToAPInt().getZExtValue();
+ } else {
+ Val = -Val;
+ }
+ }
+
+ if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
+ Inst.getNumOperands())) {
+ addLiteralImmOperand(Inst, Val);
+ } else {
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+}
+
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+ const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode());
+ auto OpNum = Inst.getNumOperands();
+ // Check that this operand accepts literals
+ assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
+
+ auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
+
+ if (Imm.IsFPImm) { // We got fp literal token
+ APInt Literal(64, Val);
+
+ switch (OpSize) {
+ case 8: {
+ if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
+ AsmParser->hasInv2PiInlineImm())) {
+ Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
+ return;
+ }
+
+ // Non-inlineable
+ if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
+ // For fp operands we check if low 32 bits are zeros
+ if (Literal.getLoBits(32) != 0) {
+ const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
+ "Can't encode literal as exact 64-bit floating-point operand. "
+ "Low 32-bits will be set to zero");
+ }
+
+ Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+ return;
+ }
+
+ // We don't allow fp literals in 64-bit integer instructions. It is
+ // unclear how we should encode them. This case should be checked earlier
+ // in predicate methods (isLiteralImm())
+ llvm_unreachable("fp literal in 64-bit integer instruction.");
+ }
+ case 4:
+ case 2: {
+ bool lost;
+ APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+ // Convert literal to the expected operand type (single or half precision)
+ FPLiteral.convert(*getFltSemantics(OpSize),
+ APFloat::rmNearestTiesToEven, &lost);
+ // We allow precision loss but not overflow or underflow. This should be
+ // checked earlier in isLiteralImm()
+ Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+ return;
+ }
+ default:
+ llvm_unreachable("invalid operand size");
+ }
+
+ return;
+ }
+
+ // We got int literal token.
+ // Only sign extend inline immediates.
+ // FIXME: No errors on truncation
+ switch (OpSize) {
+ case 4: {
+ if (isInt<32>(Val) &&
+ AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
+ AsmParser->hasInv2PiInlineImm())) {
+ Inst.addOperand(MCOperand::createImm(Val));
+ return;
+ }
+
+ Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
+ return;
+ }
+ case 8: {
+ if (AMDGPU::isInlinableLiteral64(Val,
+ AsmParser->hasInv2PiInlineImm())) {
+ Inst.addOperand(MCOperand::createImm(Val));
+ return;
+ }
+
+ Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+ return;
+ }
+ case 2: {
+ if (isInt<16>(Val) &&
+ AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
+ AsmParser->hasInv2PiInlineImm())) {
+ Inst.addOperand(MCOperand::createImm(Val));
+ return;
+ }
+
+ Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+ return;
+ }
+ default:
+ llvm_unreachable("invalid operand size");
+ }
+}
+
+template <unsigned Bitwidth>
+void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
+ APInt Literal(64, Imm.Val);
+
+ if (!Imm.IsFPImm) {
+ // We got int literal token.
+ Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
+ return;
+ }
+
+ bool Lost;
+ APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+ FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
+ APFloat::rmNearestTiesToEven, &Lost);
+ Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+}
+
+void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI())));
+}
+
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
static int getRegClass(RegisterKind Is, unsigned RegWidth) {
if (Is == IS_VGPR) {
switch (RegWidth) {
@@ -818,12 +1316,13 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R
RegWidth++;
return true;
default:
- assert(false); return false;
+ llvm_unreachable("unexpected register kind");
}
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth)
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex)
{
+ if (DwordRegIndex) { *DwordRegIndex = 0; }
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
if (getLexer().is(AsmToken::Identifier)) {
StringRef RegName = Parser.getTok().getString();
@@ -883,7 +1382,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
} else if (getLexer().is(AsmToken::LBrac)) {
// List of consecutive registers: [s0,s1,s2,s3]
Parser.Lex();
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr))
return false;
if (RegWidth != 1)
return false;
@@ -895,7 +1394,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
} else if (getLexer().is(AsmToken::RBrac)) {
Parser.Lex();
break;
- } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) {
+ } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) {
if (RegWidth1 != 1) {
return false;
}
@@ -923,11 +1422,12 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
{
unsigned Size = 1;
if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
- // SGPR and TTMP registers must be are aligned. Max required alignment is 4 dwords.
+ // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords.
Size = std::min(RegWidth, 4u);
}
if (RegNum % Size != 0)
return false;
+ if (DwordRegIndex) { *DwordRegIndex = RegNum; }
RegNum = RegNum / Size;
int RCID = getRegClass(RegKind, RegWidth);
if (RCID == -1)
@@ -940,7 +1440,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
}
default:
- assert(false); return false;
+ llvm_unreachable("unexpected register kind");
}
if (!subtargetHasRegister(*TRI, Reg))
@@ -952,20 +1452,19 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
- const MCRegisterInfo *TRI = getContext().getRegisterInfo();
-
RegisterKind RegKind;
- unsigned Reg, RegNum, RegWidth;
+ unsigned Reg, RegNum, RegWidth, DwordRegIndex;
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
return nullptr;
}
- return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc,
- TRI, &getSTI(), false);
+ KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+ // TODO: add syntactic sugar for 1/(2*PI)
bool Minus = false;
if (getLexer().getKind() == AsmToken::Minus) {
Minus = true;
@@ -978,28 +1477,21 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
int64_t IntVal;
if (getParser().parseAbsoluteExpression(IntVal))
return MatchOperand_ParseFail;
- if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
- Error(S, "invalid immediate: only 32-bit values are legal");
- return MatchOperand_ParseFail;
- }
-
if (Minus)
IntVal *= -1;
- Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
return MatchOperand_Success;
}
case AsmToken::Real: {
- // FIXME: We should emit an error if a double precisions floating-point
- // value is used. I'm not sure the best way to detect this.
int64_t IntVal;
if (getParser().parseAbsoluteExpression(IntVal))
return MatchOperand_ParseFail;
- APFloat F((float)BitsToDouble(IntVal));
+ APFloat F(BitsToDouble(IntVal));
if (Minus)
F.changeSign();
Operands.push_back(
- AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S,
+ AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S,
AMDGPUOperand::ImmTyNone, true));
return MatchOperand_Success;
}
@@ -1008,24 +1500,29 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
}
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
- auto res = parseImm(Operands);
- if (res != MatchOperand_NoMatch) {
- return res;
- }
-
+OperandMatchResultTy
+AMDGPUAsmParser::parseReg(OperandVector &Operands) {
if (auto R = parseRegister()) {
assert(R->isReg());
R->Reg.IsForcedVOP3 = isForcedVOP3();
Operands.push_back(std::move(R));
return MatchOperand_Success;
}
- return MatchOperand_ParseFail;
+ return MatchOperand_NoMatch;
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+ auto res = parseImm(Operands);
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+
+ return parseReg(Operands);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) {
// XXX: During parsing we can't determine if minus sign means
// negate-modifier or negative immediate value.
// By default we suppose it is modifier.
@@ -1055,12 +1552,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
Abs = true;
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
- AMDGPUOperand::Modifiers Mods = {false, false, false};
+ AMDGPUOperand::Modifiers Mods;
if (Negate) {
Mods.Neg = true;
}
@@ -1088,8 +1590,8 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
return MatchOperand_Success;
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) {
bool Sext = false;
if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
@@ -1102,12 +1604,17 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
Parser.Lex();
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
- AMDGPUOperand::Modifiers Mods = {false, false, false};
+ AMDGPUOperand::Modifiers Mods;
if (Sext) {
if (getLexer().isNot(AsmToken::RParen)) {
Error(Parser.getTok().getLoc(), "expected closing parentheses");
@@ -1116,14 +1623,43 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
Parser.Lex();
Mods.Sext = true;
}
-
+
if (Mods.hasIntModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
Op.setModifiers(Mods);
}
+
return MatchOperand_Success;
}
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithFPInputMods(Operands, false);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithIntInputMods(Operands, false);
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+ std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
+ if (Reg) {
+ Operands.push_back(std::move(Reg));
+ return MatchOperand_Success;
+ }
+
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.getString() == "off") {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(),
+ AMDGPUOperand::ImmTyOff, false));
+ Parser.Lex();
+ return MatchOperand_Success;
+ }
+
+ return MatchOperand_NoMatch;
+}
+
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
@@ -1139,65 +1675,137 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
getForcedEncodingSize() != 64)
return Match_PreferE32;
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
+ // v_mac_f32/16 allow only dst_sel == DWORD;
+ auto OpNum =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel);
+ const auto &Op = Inst.getOperand(OpNum);
+ if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) {
+ return Match_InvalidOperand;
+ }
+ }
+
return Match_Success;
}
+// Which asm variants we should check
+ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
+ if (getForcedEncodingSize() == 32) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedVOP3()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedSDWA()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedDPP()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DPP};
+ return makeArrayRef(Variants);
+ }
+
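+ // No specific encoding was forced, so try every variant.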
+ static const unsigned Variants[] = {
+ AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+ };
+
+ return makeArrayRef(Variants);
+}
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
MCInst Inst;
+ unsigned Result = Match_Success;
+ for (auto Variant : getMatchedVariants()) {
+ uint64_t EI;
+ auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm,
+ Variant);
+ // Match statuses are ordered from least to most specific; keep the most
+ // specific status seen across all variants:
+ // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32
+ if ((R == Match_Success) ||
+ (R == Match_PreferE32) ||
+ (R == Match_MissingFeature && Result != Match_PreferE32) ||
+ (R == Match_InvalidOperand && Result != Match_MissingFeature
+ && Result != Match_PreferE32) ||
+ (R == Match_MnemonicFail && Result != Match_InvalidOperand
+ && Result != Match_MissingFeature
+ && Result != Match_PreferE32)) {
+ Result = R;
+ ErrorInfo = EI;
+ }
+ if (R == Match_Success)
+ break;
+ }
- switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
- default: break;
- case Match_Success:
- Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
- return false;
- case Match_MissingFeature:
- return Error(IDLoc, "instruction not supported on this GPU");
+ switch (Result) {
+ default: break;
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, getSTI());
+ return false;
- case Match_MnemonicFail:
- return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_MissingFeature:
+ return Error(IDLoc, "instruction not supported on this GPU");
- case Match_InvalidOperand: {
- SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0ULL) {
- if (ErrorInfo >= Operands.size()) {
- return Error(IDLoc, "too few operands for instruction");
- }
- ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
- if (ErrorLoc == SMLoc())
- ErrorLoc = IDLoc;
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction mnemonic");
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size()) {
+ return Error(IDLoc, "too few operands for instruction");
}
- return Error(ErrorLoc, "invalid operand for instruction");
+ ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
}
- case Match_PreferE32:
- return Error(IDLoc, "internal error: instruction without _e64 suffix "
- "should be encoded as e32");
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+
+ case Match_PreferE32:
+ return Error(IDLoc, "internal error: instruction without _e64 suffix "
+ "should be encoded as e32");
}
llvm_unreachable("Implement any new match types added!");
}
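+// Parses a plain integer or an absolute expression (e.g. a symbolic constant)
+// and truncates the result to 32 bits. Returns true on failure, matching the
+// other Parse* helpers.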
+bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
+ int64_t Tmp = -1;
+ if (getLexer().isNot(AsmToken::Integer) && getLexer().isNot(AsmToken::Identifier)) {
+ return true;
+ }
+ if (getParser().parseAbsoluteExpression(Tmp)) {
+ return true;
+ }
+ Ret = static_cast<uint32_t>(Tmp);
+ return false;
+}
+
+
bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
uint32_t &Minor) {
- if (getLexer().isNot(AsmToken::Integer))
+ if (ParseAsAbsoluteExpression(Major))
return TokError("invalid major version");
- Major = getLexer().getTok().getIntVal();
- Lex();
-
if (getLexer().isNot(AsmToken::Comma))
return TokError("minor version number required, comma expected");
Lex();
- if (getLexer().isNot(AsmToken::Integer))
+ if (ParseAsAbsoluteExpression(Minor))
return TokError("invalid minor version");
- Minor = getLexer().getTok().getIntVal();
- Lex();
-
return false;
}
@@ -1214,7 +1822,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
}
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
-
uint32_t Major;
uint32_t Minor;
uint32_t Stepping;
@@ -1231,7 +1838,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return false;
}
-
if (ParseDirectiveMajorMinor(Major, Minor))
return true;
@@ -1239,12 +1845,9 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return TokError("stepping version number required, comma expected");
Lex();
- if (getLexer().isNot(AsmToken::Integer))
+ if (ParseAsAbsoluteExpression(Stepping))
return TokError("invalid stepping version");
- Stepping = getLexer().getTok().getIntVal();
- Lex();
-
if (getLexer().isNot(AsmToken::Comma))
return TokError("vendor name required, comma expected");
Lex();
@@ -1270,6 +1873,46 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return false;
}
+bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
+ std::string Metadata;
+ raw_string_ostream MS(Metadata);
+
+ getLexer().setSkipSpace(false);
+
+ bool FoundEnd = false;
+ while (!getLexer().is(AsmToken::Eof)) {
+ while (getLexer().is(AsmToken::Space)) {
+ MS << ' ';
+ Lex();
+ }
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef ID = getLexer().getTok().getIdentifier();
+ if (ID == ".end_amdgpu_runtime_metadata") {
+ Lex();
+ FoundEnd = true;
+ break;
+ }
+ }
+
+ MS << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
+
+ Parser.eatToEndOfStatement();
+ }
+
+ getLexer().setSkipSpace(true);
+
+ if (getLexer().is(AsmToken::Eof) && !FoundEnd)
+ return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+
+ MS.flush();
+
+ getTargetStreamer().EmitRuntimeMetadata(Metadata);
+
+ return false;
+}
+
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
SmallString<40> ErrStr;
@@ -1282,12 +1925,10 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
}
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
-
amd_kernel_code_t Header;
AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
while (true) {
-
// Lex EndOfStatement. This is in a while loop, because lexing a comment
// will set the current token to EndOfStatement.
while(getLexer().is(AsmToken::EndOfStatement))
@@ -1326,6 +1967,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
ELF::STT_AMDGPU_HSA_KERNEL);
Lex();
+ KernelScope.initialize(getContext());
return false;
}
@@ -1378,6 +2020,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".hsa_code_object_isa")
return ParseDirectiveHSACodeObjectISA();
+ if (IDVal == ".amdgpu_runtime_metadata")
+ return ParseDirectiveRuntimeMetadata();
+
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
@@ -1433,7 +2078,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
return true;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
// Try to parse with a custom parser
@@ -1464,11 +2109,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
SMLoc S = Tok.getLoc();
const MCExpr *Expr = nullptr;
if (!Parser.parseExpression(Expr)) {
- Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S));
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
return MatchOperand_Success;
}
- Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc()));
+ Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc()));
Parser.Lex();
return MatchOperand_Success;
}
@@ -1502,10 +2147,10 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
SMLoc NameLoc, OperandVector &Operands) {
// Add the instruction mnemonic
Name = parseMnemonicSuffix(Name);
- Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
+ Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
while (!getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
+ OperandMatchResultTy Res = parseOperand(Operands, Name);
// Eat the comma or space if there is one.
if (getLexer().is(AsmToken::Comma))
@@ -1535,7 +2180,7 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
// Utility functions
//===----------------------------------------------------------------------===//
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
switch(getLexer().getKind()) {
default: return MatchOperand_NoMatch;
@@ -1561,15 +2206,14 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
return MatchOperand_Success;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
enum AMDGPUOperand::ImmTy ImmTy,
bool (*ConvertResult)(int64_t&)) {
-
SMLoc S = Parser.getTok().getLoc();
int64_t Value = 0;
- AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
+ OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
if (Res != MatchOperand_Success)
return Res;
@@ -1577,11 +2221,11 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
return MatchOperand_ParseFail;
}
- Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy));
return MatchOperand_Success;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
enum AMDGPUOperand::ImmTy ImmTy) {
int64_t Bit = 0;
@@ -1609,7 +2253,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
}
}
- Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
return MatchOperand_Success;
}
@@ -1627,7 +2271,7 @@ void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
}
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
if (getLexer().isNot(AsmToken::Identifier)) {
return MatchOperand_NoMatch;
@@ -1657,7 +2301,6 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
const OperandVector &Operands) {
-
OptionalImmIndexMap OptionalIdx;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
@@ -1681,7 +2324,6 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
}
void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
-
std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
bool GDSOnly = false;
@@ -1712,6 +2354,46 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
+void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned EnMask = 0;
+ int SrcIdx = 0;
+
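+ // Walk the sources and build the 4-bit export enable mask: a source written
+ // as a VGPR sets its bit; a source written as "off" leaves the bit clear and
+ // is emitted as NoRegister.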
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+ // Add the register arguments
+ if (Op.isReg()) {
+ EnMask |= (1 << SrcIdx);
+ Op.addRegOperands(Inst, 1);
+ ++SrcIdx;
+ continue;
+ }
+
+ if (Op.isOff()) {
+ ++SrcIdx;
+ Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+ continue;
+ }
+
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyExpTgt) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ if (Op.isToken() && Op.getToken() == "done")
+ continue;
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
+
+ Inst.addOperand(MCOperand::createImm(EnMask));
+}
//===----------------------------------------------------------------------===//
// s_waitcnt
@@ -1739,52 +2421,41 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
Parser.Lex();
- int CntShift;
- int CntMask;
-
- if (CntName == "vmcnt") {
- CntMask = 0xf;
- CntShift = 0;
- } else if (CntName == "expcnt") {
- CntMask = 0x7;
- CntShift = 4;
- } else if (CntName == "lgkmcnt") {
- CntMask = 0xf;
- CntShift = 8;
- } else {
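+ // The counter bit-field widths and positions depend on the ISA version, so
+ // defer to the version-aware encode helpers instead of hard-coded masks.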
+ IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+ if (CntName == "vmcnt")
+ IntVal = encodeVmcnt(IV, IntVal, CntVal);
+ else if (CntName == "expcnt")
+ IntVal = encodeExpcnt(IV, IntVal, CntVal);
+ else if (CntName == "lgkmcnt")
+ IntVal = encodeLgkmcnt(IV, IntVal, CntVal);
+ else
return true;
- }
- IntVal &= ~(CntMask << CntShift);
- IntVal |= (CntVal << CntShift);
return false;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
- // Disable all counters by default.
- // vmcnt [3:0]
- // expcnt [6:4]
- // lgkmcnt [11:8]
- int64_t CntVal = 0xf7f;
+ IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+ int64_t Waitcnt = getWaitcntBitMask(IV);
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
default: return MatchOperand_ParseFail;
case AsmToken::Integer:
// The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(CntVal))
+ if (getParser().parseAbsoluteExpression(Waitcnt))
return MatchOperand_ParseFail;
break;
case AsmToken::Identifier:
do {
- if (parseCnt(CntVal))
+ if (parseCnt(Waitcnt))
return MatchOperand_ParseFail;
} while(getLexer().isNot(AsmToken::EndOfStatement));
break;
}
- Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
return MatchOperand_Success;
}
@@ -1849,7 +2520,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
return false;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
@@ -1889,7 +2560,7 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
}
break;
}
- Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
return MatchOperand_Success;
}
@@ -1997,7 +2668,147 @@ bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &O
return false;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ StringRef Str = Parser.getTok().getString();
+ int Slot = StringSwitch<int>(Str)
+ .Case("p10", 0)
+ .Case("p20", 1)
+ .Case("p0", 2)
+ .Default(-1);
+
+ SMLoc S = Parser.getTok().getLoc();
+ if (Slot == -1)
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S,
+ AMDGPUOperand::ImmTyInterpSlot));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ StringRef Str = Parser.getTok().getString();
+ if (!Str.startswith("attr"))
+ return MatchOperand_NoMatch;
+
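+ // Operands have the form attr<N>.<chan>, e.g. attr3.x: the trailing two
+ // characters select the channel and the digits after "attr" give the
+ // attribute index.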
+ StringRef Chan = Str.take_back(2);
+ int AttrChan = StringSwitch<int>(Chan)
+ .Case(".x", 0)
+ .Case(".y", 1)
+ .Case(".z", 2)
+ .Case(".w", 3)
+ .Default(-1);
+ if (AttrChan == -1)
+ return MatchOperand_ParseFail;
+
+ Str = Str.drop_back(2).drop_front(4);
+
+ uint8_t Attr;
+ if (Str.getAsInteger(10, Attr))
+ return MatchOperand_ParseFail;
+
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex();
+ if (Attr > 63) {
+ Error(S, "out of bounds attr");
+ return MatchOperand_Success;
+ }
+
+ SMLoc SChan = SMLoc::getFromPointer(Chan.data());
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S,
+ AMDGPUOperand::ImmTyInterpAttr));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan,
+ AMDGPUOperand::ImmTyAttrChan));
+ return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::errorExpTgt() {
+ Error(Parser.getTok().getLoc(), "invalid exp target");
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
+ uint8_t &Val) {
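+ // Export target encoding: mrt0-7 -> 0-7, mrtz -> 8, null -> 9,
+ // pos0-3 -> 12-15, param0-31 -> 32-63.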
+ if (Str == "null") {
+ Val = 9;
+ return MatchOperand_Success;
+ }
+
+ if (Str.startswith("mrt")) {
+ Str = Str.drop_front(3);
+ if (Str == "z") { // == mrtz
+ Val = 8;
+ return MatchOperand_Success;
+ }
+
+ if (Str.getAsInteger(10, Val))
+ return MatchOperand_ParseFail;
+
+ if (Val > 7)
+ errorExpTgt();
+
+ return MatchOperand_Success;
+ }
+
+ if (Str.startswith("pos")) {
+ Str = Str.drop_front(3);
+ if (Str.getAsInteger(10, Val))
+ return MatchOperand_ParseFail;
+
+ if (Val > 3)
+ errorExpTgt();
+
+ Val += 12;
+ return MatchOperand_Success;
+ }
+
+ if (Str.startswith("param")) {
+ Str = Str.drop_front(5);
+ if (Str.getAsInteger(10, Val))
+ return MatchOperand_ParseFail;
+
+ if (Val >= 32)
+ errorExpTgt();
+
+ Val += 32;
+ return MatchOperand_Success;
+ }
+
+ if (Str.startswith("invalid_target_")) {
+ Str = Str.drop_front(15);
+ if (Str.getAsInteger(10, Val))
+ return MatchOperand_ParseFail;
+
+ errorExpTgt();
+ return MatchOperand_Success;
+ }
+
+ return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+ uint8_t Val;
+ StringRef Str = Parser.getTok().getString();
+
+ auto Res = parseExpTgtImpl(Str, Val);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex();
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S,
+ AMDGPUOperand::ImmTyExpTgt));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
using namespace llvm::AMDGPU::SendMsg;
@@ -2068,11 +2879,11 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
}
Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
}
- } while (0);
+ } while (false);
}
break;
}
- Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
return MatchOperand_Success;
}
@@ -2084,7 +2895,7 @@ bool AMDGPUOperand::isSendMsg() const {
// sopp branch targets
//===----------------------------------------------------------------------===//
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
@@ -2094,12 +2905,12 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
int64_t Imm;
if (getParser().parseAbsoluteExpression(Imm))
return MatchOperand_ParseFail;
- Operands.push_back(AMDGPUOperand::CreateImm(Imm, S));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S));
return MatchOperand_Success;
}
case AsmToken::Identifier:
- Operands.push_back(AMDGPUOperand::CreateExpr(
+ Operands.push_back(AMDGPUOperand::CreateExpr(this,
MCSymbolRefExpr::create(getContext().getOrCreateSymbol(
Parser.getTok().getString()), getContext()), S));
Parser.Lex();
@@ -2112,15 +2923,15 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
//===----------------------------------------------------------------------===//
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
}
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
@@ -2192,7 +3003,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
- assert(false);
+ llvm_unreachable("unexpected operand type");
}
}
@@ -2228,7 +3039,7 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands)
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
- assert(false);
+ llvm_unreachable("unexpected operand type");
}
}
@@ -2243,48 +3054,53 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands)
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
}
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
-bool AMDGPUOperand::isSMRDOffset() const {
-
- // FIXME: Support 20-bit offsets on VI. We need to to pass subtarget
- // information here.
+bool AMDGPUOperand::isSMRDOffset8() const {
return isImm() && isUInt<8>(getImm());
}
+bool AMDGPUOperand::isSMRDOffset20() const {
+ return isImm() && isUInt<20>(getImm());
+}
+
bool AMDGPUOperand::isSMRDLiteralOffset() const {
// 32-bit literals are only supported on CI and we only want to use them
// when the offset is > 8-bits.
return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
//===----------------------------------------------------------------------===//
@@ -2317,10 +3133,13 @@ static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
if (BoundCtrl == 0) {
BoundCtrl = 1;
return true;
- } else if (BoundCtrl == -1) {
+ }
+
+ if (BoundCtrl == -1) {
BoundCtrl = 0;
return true;
}
+
return false;
}
@@ -2350,9 +3169,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
{"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+ {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
};
-AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
OperandMatchResultTy res;
for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
// try to parse any optional operand here
@@ -2376,16 +3196,19 @@ AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(Oper
return MatchOperand_NoMatch;
}
-AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
-{
+OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) {
StringRef Name = Parser.getTok().getString();
if (Name == "mul") {
- return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
- } else if (Name == "div") {
- return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
- } else {
- return MatchOperand_NoMatch;
+ return parseIntWithPrefix("mul", Operands,
+ AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
+ }
+
+ if (Name == "div") {
+ return parseIntWithPrefix("div", Operands,
+ AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
}
+
+ return MatchOperand_NoMatch;
}
void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
@@ -2407,6 +3230,17 @@ void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands)
}
}
+static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
+ // 1. This operand is an input-modifiers operand
+ return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
+ // 2. It is not the last operand
+ && Desc.NumOperands > (OpNum + 1)
+ // 3. The next operand is a register-class operand
+ && Desc.OpInfo[OpNum + 1].RegClass != -1
+ // 4. The next register is not tied to any other operand
+ && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
+}
+
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
unsigned I = 1;
@@ -2417,18 +3251,36 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (Op.isRegOrImmWithInputMods()) {
- // only fp modifiers allowed in VOP3
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
- assert(false);
+ llvm_unreachable("unhandled operand type");
}
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+
+ // Special case for v_mac_{f16, f32}: it has a src2 register operand that is
+ // tied to the dst operand. The assembler does not accept modifiers for this
+ // operand, so src2_modifiers must be 0.
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
+ auto it = Inst.begin();
+ std::advance(
+ it,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
+ AMDGPU::V_MAC_F16_e64 :
+ AMDGPU::V_MAC_F32_e64,
+ AMDGPU::OpName::src2_modifiers));
+ it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
+ ++it;
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ }
}
//===----------------------------------------------------------------------===//
@@ -2455,7 +3307,11 @@ bool AMDGPUOperand::isDPPCtrl() const {
return false;
}
-AMDGPUAsmParser::OperandMatchResultTy
+bool AMDGPUOperand::isGPRIdxMode() const {
+ return isImm() && isUInt<4>(getImm());
+}
+
+OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
StringRef Prefix;
@@ -2469,8 +3325,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
if (Prefix == "row_mirror") {
Int = 0x140;
+ Parser.Lex();
} else if (Prefix == "row_half_mirror") {
Int = 0x141;
+ Parser.Lex();
} else {
// Check to prevent parseDPPCtrlOps from eating invalid tokens
if (Prefix != "quad_perm"
@@ -2494,60 +3352,46 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
Parser.Lex();
if (getLexer().isNot(AsmToken::LBrac))
return MatchOperand_ParseFail;
-
Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
- Int = getLexer().getTok().getIntVal();
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
+ if (getParser().parseAbsoluteExpression(Int) || !(0 <= Int && Int <= 3))
return MatchOperand_ParseFail;
- Int += (getLexer().getTok().getIntVal() << 2);
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
- Int += (getLexer().getTok().getIntVal() << 4);
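+ // quad_perm packs four 2-bit lane selects: the first value fills bits [1:0]
+ // and each subsequent value is shifted left by two more bits.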
+ for (int i = 0; i < 3; ++i) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
- Int += (getLexer().getTok().getIntVal() << 6);
+ int64_t Temp;
+ if (getParser().parseAbsoluteExpression(Temp) || !(0 <= Temp && Temp <= 3))
+ return MatchOperand_ParseFail;
+ const int shift = i * 2 + 2;
+ Int += (Temp << shift);
+ }
- Parser.Lex();
if (getLexer().isNot(AsmToken::RBrac))
return MatchOperand_ParseFail;
+ Parser.Lex();
} else {
// sel:%d
Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
+ if (getParser().parseAbsoluteExpression(Int))
return MatchOperand_ParseFail;
- Int = getLexer().getTok().getIntVal();
- if (Prefix == "row_shl") {
+ if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
Int |= 0x100;
- } else if (Prefix == "row_shr") {
+ } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
Int |= 0x110;
- } else if (Prefix == "row_ror") {
+ } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
Int |= 0x120;
- } else if (Prefix == "wave_shl") {
+ } else if (Prefix == "wave_shl" && 1 == Int) {
Int = 0x130;
- } else if (Prefix == "wave_rol") {
+ } else if (Prefix == "wave_rol" && 1 == Int) {
Int = 0x134;
- } else if (Prefix == "wave_shr") {
+ } else if (Prefix == "wave_shr" && 1 == Int) {
Int = 0x138;
- } else if (Prefix == "wave_ror") {
+ } else if (Prefix == "wave_ror" && 1 == Int) {
Int = 0x13C;
} else if (Prefix == "row_bcast") {
if (Int == 15) {
@@ -2562,23 +3406,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
}
}
}
- Parser.Lex(); // eat last token
- Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
- AMDGPUOperand::ImmTyDppCtrl));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTyDppCtrl));
return MatchOperand_Success;
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
- return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
+ return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
- return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
+ return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
}
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
- return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
}
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
@@ -2593,9 +3435,12 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isRegOrImmWithInputMods()) {
- // Only float modifiers supported in DPP
- Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOP2b (v_add_u32, v_sub_u32 ...) carries a "vcc" token. Skip it.
+ continue;
+ } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithFPInputModsOperands(Inst, 2);
} else if (Op.isDPPCtrl()) {
Op.addImmOperands(Inst, 1);
} else if (Op.isImm()) {
@@ -2609,18 +3454,30 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+
+ // Special case for v_mac_{f16, f32}: it has a src2 register operand that is
+ // tied to the dst operand.
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
+ auto it = Inst.begin();
+ std::advance(
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ }
}
//===----------------------------------------------------------------------===//
// sdwa
//===----------------------------------------------------------------------===//
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type) {
+ using namespace llvm::AMDGPU::SDWA;
+
SMLoc S = Parser.getTok().getLoc();
StringRef Value;
- AMDGPUAsmParser::OperandMatchResultTy res;
+ OperandMatchResultTy res;
res = parseStringWithPrefix(Prefix, Value);
if (res != MatchOperand_Success) {
@@ -2629,13 +3486,13 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
int64_t Int;
Int = StringSwitch<int64_t>(Value)
- .Case("BYTE_0", 0)
- .Case("BYTE_1", 1)
- .Case("BYTE_2", 2)
- .Case("BYTE_3", 3)
- .Case("WORD_0", 4)
- .Case("WORD_1", 5)
- .Case("DWORD", 6)
+ .Case("BYTE_0", SdwaSel::BYTE_0)
+ .Case("BYTE_1", SdwaSel::BYTE_1)
+ .Case("BYTE_2", SdwaSel::BYTE_2)
+ .Case("BYTE_3", SdwaSel::BYTE_3)
+ .Case("WORD_0", SdwaSel::WORD_0)
+ .Case("WORD_1", SdwaSel::WORD_1)
+ .Case("DWORD", SdwaSel::DWORD)
.Default(0xffffffff);
Parser.Lex(); // eat last token
@@ -2643,15 +3500,17 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
return MatchOperand_ParseFail;
}
- Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type));
return MatchOperand_Success;
}
-AMDGPUAsmParser::OperandMatchResultTy
+OperandMatchResultTy
AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SDWA;
+
SMLoc S = Parser.getTok().getLoc();
StringRef Value;
- AMDGPUAsmParser::OperandMatchResultTy res;
+ OperandMatchResultTy res;
res = parseStringWithPrefix("dst_unused", Value);
if (res != MatchOperand_Success) {
@@ -2660,9 +3519,9 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
int64_t Int;
Int = StringSwitch<int64_t>(Value)
- .Case("UNUSED_PAD", 0)
- .Case("UNUSED_SEXT", 1)
- .Case("UNUSED_PRESERVE", 2)
+ .Case("UNUSED_PAD", DstUnused::UNUSED_PAD)
+ .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT)
+ .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
.Default(0xffffffff);
Parser.Lex(); // eat last token
@@ -2670,8 +3529,7 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
- AMDGPUOperand::ImmTySdwaDstUnused));
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySdwaDstUnused));
return MatchOperand_Success;
}
@@ -2700,13 +3558,15 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (BasicInstType == SIInstrFlags::VOPC &&
+ if ((BasicInstType == SIInstrFlags::VOPC ||
+ BasicInstType == SIInstrFlags::VOP2) &&
Op.isReg() &&
Op.Reg.RegNo == AMDGPU::VCC) {
- // VOPC sdwa use "vcc" token as dst. Skip it.
+ // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+ // Skip it.
continue;
- } else if (Op.isRegOrImmWithInputMods()) {
- Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
@@ -2716,46 +3576,55 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-
- if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) {
- // V_NOP_sdwa has no optional sdwa arguments
- return;
- }
- switch (BasicInstType) {
- case SIInstrFlags::VOP1: {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- break;
- }
- case SIInstrFlags::VOP2: {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
- break;
- }
- case SIInstrFlags::VOPC: {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
- break;
+
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+ // V_NOP_sdwa_vi has no optional sdwa arguments
+ switch (BasicInstType) {
+ case SIInstrFlags::VOP1:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ break;
+
+ case SIInstrFlags::VOP2:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+
+ case SIInstrFlags::VOPC:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+
+ default:
+ llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
+ }
}
- default:
- llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
+
+ // Special case for v_mac_{f16, f32}: it has a src2 register operand that is
+ // tied to the dst operand.
+ if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
+ Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
+ auto it = Inst.begin();
+ std::advance(
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ Inst.insert(it, Inst.getOperand(0)); // src2 = dst
}
+
}
/// Force static initialization.
extern "C" void LLVMInitializeAMDGPUAsmParser() {
- RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
- RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
+ RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
+ RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
}
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#include "AMDGPUGenAsmMatcher.inc"
-
// This function should be defined after auto-generated include so that we have
// MatchClassKind enum defined
unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
@@ -2776,16 +3645,27 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
case MCK_offen:
return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
- case MCK_SSrc32:
+ case MCK_SSrcB32:
// When operands have expression values, they will return true for isToken,
// because it is not possible to distinguish between a token and an
// expression at parse time. MatchInstructionImpl() will always try to
// match an operand as a token, when isToken returns true, and when the
// name of the expression is not a valid token, the match will fail,
// so we need to handle it here.
- return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand;
+ return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand;
+ case MCK_SSrcF32:
+ return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand;
case MCK_SoppBrTarget:
return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
- default: return Match_InvalidOperand;
+ case MCK_VReg32OrOff:
+ return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand;
+ case MCK_InterpSlot:
+ return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand;
+ case MCK_Attr:
+ return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
+ case MCK_AttrChan:
+ return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ default:
+ return Match_InvalidOperand;
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
new file mode 100644
index 0000000..45a7fe6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -0,0 +1,1350 @@
+//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
+def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
+def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
+def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
+
+class MubufLoad <SDPatternOperator op> : PatFrag <
+ (ops node:$ptr), (op node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+def mubuf_load : MubufLoad <load>;
+def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
+def mubuf_load_atomic : MubufLoad <atomic_load>;
+
+def BUFAddrKind {
+ int Offset = 0;
+ int OffEn = 1;
+ int IdxEn = 2;
+ int BothEn = 3;
+ int Addr64 = 4;
+}
+
+class getAddrName<int addrKind> {
+ string ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "offset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn), "offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), "idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn), "bothen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64), "addr64",
+ "")))));
+}
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
+
+//===----------------------------------------------------------------------===//
+// MTBUF classes
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MTBUF = 1;
+ let Uses = [EXEC];
+
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+}
+
+class MTBUF_Real <MTBUF_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+
+ bits<8> vdata;
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> addr64;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{54} = slc;
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
+}
+
+class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+ opName, (outs regClass:$dst),
+ (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+ i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+ i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+ " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+ " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+ opName, (outs),
+ (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+ SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+ " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+ " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MUBUF = 1;
+ let Uses = [EXEC];
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+
+ let AsmMatchConverter = "cvtMubuf";
+
+ bits<1> offen = 0;
+ bits<1> idxen = 0;
+ bits<1> addr64 = 0;
+ bits<1> has_vdata = 1;
+ bits<1> has_vaddr = 1;
+ bits<1> has_glc = 1;
+ bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> has_srsrc = 1;
+ bits<1> has_soffset = 1;
+ bits<1> has_offset = 1;
+ bits<1> has_slc = 1;
+ bits<1> has_tfe = 1;
+}
+
+class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+
+ bits<12> offset;
+ bits<1> glc;
+ bits<1> lds = 0;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+
+// For cache invalidation instructions.
+class MUBUF_Invalidate <string opName, SDPatternOperator node> :
+ MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> {
+
+ let AsmMatchConverter = "";
+
+ let hasSideEffects = 1;
+ let mayStore = 1;
+
+ // Set everything to 0.
+ let offen = 0;
+ let idxen = 0;
+ let addr64 = 0;
+ let has_vdata = 0;
+ let has_vaddr = 0;
+ let has_glc = 0;
+ let glc_value = 0;
+ let has_srsrc = 0;
+ let has_soffset = 0;
+ let has_offset = 0;
+ let has_slc = 0;
+ let has_tfe = 0;
+}
+
+class getMUBUFInsDA<list<RegisterClass> vdataList,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag InsNoData = !if(!empty(vaddrList),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag InsData = !if(!empty(vaddrList),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag ret = !if(!empty(vdataList), InsNoData, InsData);
+}
+
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+ (ins))))));
+}
+
+class getMUBUFAsmOps<int addrKind> {
+ string Pfx =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn), "$vaddr, $srsrc, $soffset offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), "$vaddr, $srsrc, $soffset idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn), "$vaddr, $srsrc, $soffset idxen offen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc, $soffset addr64",
+ "")))));
+ string ret = Pfx # "$offset";
+}
+
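+// Derives the offen/idxen/addr64 encoding bits and the has_vaddr flag from a
+// BUFAddrKind; mixed into the load, store and atomic pseudo classes below.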
+class MUBUF_SetupAddr<int addrKind> {
+ bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+ bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+class MUBUF_Load_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MUBUF_Pseudo<opName,
+ (outs vdataClass:$vdata),
+ getMUBUFIns<addrKindCopy>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// FIXME: tfe can't be an operand because it requires a separate
+// opcode because it needs an N+1 register class dest register.
+multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<0>;
+
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<1>;
+
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+class MUBUF_Store_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Pseudo<opName,
+ (outs),
+ getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+
+multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ ValueType store_vt = i32,
+ SDPatternOperator st = null_frag> {
+
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ MUBUFAddr64Table<0>;
+
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ MUBUFAddr64Table<1>;
+
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+
+class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag ret = !if(vdata_in,
+ !if(!empty(vaddrList),
+ (ins vdataClass:$vdata_in,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ ),
+ !if(!empty(vaddrList),
+ (ins vdataClass:$vdata,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+ ));
+}
+
+class getMUBUFAtomicIns<int addrKind,
+ RegisterClass vdataClass,
+ bit vdata_in,
+ // Workaround bug bz30254
+ RegisterClass vdataClassCopy=vdataClass> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64),
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ (ins))))));
+}
+
+class MUBUF_Atomic_Pseudo<string opName,
+ int addrKind,
+ dag outs,
+ dag ins,
+ string asmOps,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>,
+ MUBUF_SetupAddr<addrKindCopy> {
+ let mayStore = 1;
+ let mayLoad = 1;
+ let hasPostISelHook = 1;
+ let hasSideEffects = 1;
+ let DisableWQM = 1;
+ let has_glc = 0;
+ let has_tfe = 0;
+}
+
+class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ (outs),
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc",
+ pattern>,
+ AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let glc_value = 0;
+ let AsmMatchConverter = "cvtMubufAtomic";
+}
+
+class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ (outs vdataClassCopy:$vdata),
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # " glc$slc",
+ pattern>,
+ AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
+ let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
+ let glc_value = 1;
+ let Constraints = "$vdata = $vdata_in";
+ let DisableEncoding = "$vdata_in";
+ let AsmMatchConverter = "cvtMubufAtomicReturn";
+}
+
+multiclass MUBUF_Pseudo_Atomics <string opName,
+ RegisterClass vdataClass,
+ ValueType vdataType,
+ SDPatternOperator atomic> {
+
+ def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
+ MUBUFAddr64Table <0>;
+ def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
+ MUBUFAddr64Table <1>;
+ def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <0, "_RTN">;
+
+ def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <1, "_RTN">;
+
+ def _RTN_OFFEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _RTN_IDXEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
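+
+// Expansion sketch for the multiclass above: a single
+//   defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
+//     "buffer_atomic_add", VGPR_32, i32, atomic_add_global>;
+// yields the no-return pseudos BUFFER_ATOMIC_ADD_{OFFSET,ADDR64,OFFEN,IDXEN,BOTHEN}
+// and the returning BUFFER_ATOMIC_ADD_RTN_* variants; selection patterns are
+// attached only to the _RTN_OFFSET and _RTN_ADDR64 forms.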
+
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+ "buffer_load_format_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xy", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xyz", VReg_96
+>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
+ "buffer_load_format_xyzw", VReg_128
+>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
+ "buffer_store_format_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xy", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xyz", VReg_96
+>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
+ "buffer_store_format_xyzw", VReg_128
+>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+ "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+ "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+ "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+ "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+ "buffer_load_dword", VGPR_32, i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
+ "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
+>;
+defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
+ "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+>;
+defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
+ "buffer_store_short", VGPR_32, i32, truncstorei16_global
+>;
+defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
+ "buffer_store_dword", VGPR_32, i32, global_store
+>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx2", VReg_64, v2i32, global_store
+>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx3", VReg_96, untyped, global_store
+>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
+ "buffer_store_dwordx4", VReg_128, v4i32, global_store
+>;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
+
+let SubtargetPredicate = isSI in { // isn't on CI & VI
+/*
+defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
+defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
+*/
+
+def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
+ int_amdgcn_buffer_wbinvl1_sc>;
+}
+
+def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
+ int_amdgcn_buffer_wbinvl1>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+
+} // End let SubtargetPredicate = isGCN
+
+let SubtargetPredicate = isCIVI in {
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
+
+def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
+ int_amdgcn_buffer_wbinvl1_vol>;
+
+} // End let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+// int_SI_vs_load_input
+def : Pat<
+ (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
+ (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
+>;
+
+// Offset in a 32-bit VGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, i32:$voff),
+ (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// buffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+
+//===----------------------------------------------------------------------===//
+// buffer_atomic patterns
+//===----------------------------------------------------------------------===//
+
+multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+ >;
+}
+
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+
+class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
+ PatFrag constant_ld> : Pat <
+ (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ >;
+
+multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
+ ValueType vt, PatFrag atomic_ld> {
+ def : Pat <
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
+}
+
+let Predicates = [isSICI] in {
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
+} // End Predicates = [isSICI]
+
+multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+ PatFrag ld> {
+
+ def : Pat <
+ (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ >;
+}
+
+let Predicates = [Has16BitInsts] in {
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+
+} // End Predicates = [Has16BitInsts]
+
+class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
+ (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+
+// BUFFER_LOAD_DWORD*, addr64=0
+multiclass MUBUF_Load_Dword <ValueType vt,
+ MUBUF_Pseudo offset,
+ MUBUF_Pseudo offen,
+ MUBUF_Pseudo idxen,
+ MUBUF_Pseudo bothen> {
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
+ imm:$offset, 0, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 1, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 0, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
+ imm:$offset, 1, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+}
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
+ BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
+defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
+ BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
+defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
+ BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+
+multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
+ ValueType vt, PatFrag atomic_st> {
+  // Stores follow the atomic op convention, so the address comes first.
+ def : Pat <
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
+}
+let Predicates = [isSICI] in {
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
+} // End Predicates = [isSICI]
+
+
+multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+ PatFrag st> {
+
+ def : Pat <
+ (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
+
+class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
+ (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+ u16imm:$offset)),
+ (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
+ (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+ i32:$soffset, imm:$inst_offset, imm:$dfmt,
+ imm:$nfmt, imm:$offen, imm:$idxen,
+ imm:$glc, imm:$slc, imm:$tfe),
+ (opcode
+ $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+ (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+ (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+
+} // End let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target instructions, move to the appropriate target TD file
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+ let AssemblerPredicate=isSICI;
+ let DecoderNamespace="SICI";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{15} = ps.addr64;
+ let Inst{16} = lds;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
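+
+// Worked reading of the layout above (illustrative only): with op = 0x0c
+// (BUFFER_LOAD_DWORD below), bits 24-18 hold 0x0c and bits 31-26 hold the
+// fixed MUBUF major value 0x38; the immediate offset occupies bits 11-0, and
+// the second dword (bits 63-32) carries vaddr, vdata, srsrc{6-2}, slc, tfe
+// and soffset, each encoded only when the corresponding has_* flag is set.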
+
+multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
+ def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
+ def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+ def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>;
+ def _RTN_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+ def _RTN_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+ def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>;
+//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>;
+
+//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI
+//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI
+//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
+//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
+//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
+//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
+
+def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>;
+def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
+
+class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
+ MTBUF_Real<ps>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+ let AssemblerPredicate=isSICI;
+ let DecoderNamespace="SICI";
+
+ bits<1> addr64;
+ let Inst{15} = addr64;
+ let Inst{18-16} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
+def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real_si<op, ps> {
+ let AssemblerPredicate=isCIOnly;
+ let DecoderNamespace="CI";
+}
+
+def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+ MUBUF_Real<op, ps>,
+ Enc64,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+ let AssemblerPredicate=isVI;
+ let DecoderNamespace="VI";
+
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{16} = lds;
+ let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
+ def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
+ MUBUF_Real_AllAddr_vi<op> {
+ def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+ def _RTN_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+ def _RTN_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+ def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_vi <0x1c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_vi <0x1d>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_vi <0x1e>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_vi <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_vi <0x43>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_vi <0x44>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_vi <0x45>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_vi <0x46>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_vi <0x47>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_vi <0x48>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_vi <0x49>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_vi <0x4a>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_vi <0x4b>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_vi <0x4c>;
+
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_vi <0x60>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_vi <0x61>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_vi <0x62>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_vi <0x63>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_vi <0x64>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_vi <0x65>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_vi <0x66>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_vi <0x67>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_vi <0x68>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_vi <0x69>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
+
+def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
+def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real<ps>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+ let AssemblerPredicate=isVI;
+ let DecoderNamespace="VI";
+
+ let Inst{18-15} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
+def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
index f9a9f79..26a483a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -12,338 +12,4 @@
// S_CBRANCH_CDBGUSER
// S_CBRANCH_CDBGSYS
// S_CBRANCH_CDBGSYS_OR_USER
-// S_CBRANCH_CDBGSYS_AND_USER
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-let SubtargetPredicate = isCIVI in {
-
-let SchedRW = [WriteDoubleAdd] in {
-defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
- VOP_F64_F64, ftrunc
->;
-defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
- VOP_F64_F64, fceil
->;
-defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
- VOP_F64_F64, ffloor
->;
-defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
- VOP_F64_F64, frint
->;
-} // End SchedRW = [WriteDoubleAdd]
-
-let SchedRW = [WriteQuarterRate32] in {
-defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
- VOP_F32_F32
->;
-defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
- VOP_F32_F32
->;
-} // End SchedRW = [WriteQuarterRate32]
-
-//===----------------------------------------------------------------------===//
-// VOP3 Instructions
-//===----------------------------------------------------------------------===//
-
-defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
- VOP_I32_I32_I32
->;
-
-let isCommutable = 1 in {
-defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
- VOP_I64_I32_I32_I64
->;
-
-// XXX - Does this set VCC?
-defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
- VOP_I64_I32_I32_I64
->;
-} // End isCommutable = 1
-
-
-//===----------------------------------------------------------------------===//
-// DS Instructions
-//===----------------------------------------------------------------------===//
-defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
-
-// DS_CONDXCHG32_RTN_B64
-// DS_CONDXCHG32_RTN_B128
-
-//===----------------------------------------------------------------------===//
-// SMRD Instructions
-//===----------------------------------------------------------------------===//
-
-defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>,
- "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
-
-//===----------------------------------------------------------------------===//
-// MUBUF Instructions
-//===----------------------------------------------------------------------===//
-
-let DisableSIDecoder = 1 in {
-defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>,
- "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol
->;
-}
-
-//===----------------------------------------------------------------------===//
-// Flat Instructions
-//===----------------------------------------------------------------------===//
-
-defm FLAT_LOAD_UBYTE : FLAT_Load_Helper <
- flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32
->;
-defm FLAT_LOAD_SBYTE : FLAT_Load_Helper <
- flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32
->;
-defm FLAT_LOAD_USHORT : FLAT_Load_Helper <
- flat<0xa, 0x12>, "flat_load_ushort", VGPR_32
->;
-defm FLAT_LOAD_SSHORT : FLAT_Load_Helper <
- flat<0xb, 0x13>, "flat_load_sshort", VGPR_32>
-;
-defm FLAT_LOAD_DWORD : FLAT_Load_Helper <
- flat<0xc, 0x14>, "flat_load_dword", VGPR_32
->;
-defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <
- flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64
->;
-defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <
- flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128
->;
-defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <
- flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96
->;
-defm FLAT_STORE_BYTE : FLAT_Store_Helper <
- flat<0x18>, "flat_store_byte", VGPR_32
->;
-defm FLAT_STORE_SHORT : FLAT_Store_Helper <
- flat <0x1a>, "flat_store_short", VGPR_32
->;
-defm FLAT_STORE_DWORD : FLAT_Store_Helper <
- flat<0x1c>, "flat_store_dword", VGPR_32
->;
-defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
- flat<0x1d>, "flat_store_dwordx2", VReg_64
->;
-defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
- flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128
->;
-defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
- flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
->;
-defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
- flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat
->;
-defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32,
- atomic_cmp_swap_flat, v2i32, VReg_64
->;
-defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
- flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat
->;
-defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
- flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat
->;
-defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
- flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat
->;
-defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
- flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat
->;
-defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
- flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat
->;
-defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
- flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat
->;
-defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
- flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat
->;
-defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
- flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat
->;
-defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
- flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat
->;
-defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
- flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat
->;
-defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
- flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat
->;
-defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
- flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64, i64, atomic_swap_flat
->;
-defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64,
- atomic_cmp_swap_flat, v2i64, VReg_128
->;
-defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
- flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat
->;
-defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
- flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat
->;
-defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
- flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat
->;
-defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
- flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat
->;
-defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
- flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat
->;
-defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
- flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat
->;
-defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
- flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat
->;
-defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
- flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat
->;
-defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
- flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat
->;
-defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
- flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat
->;
-defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
- flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat
->;
-
-} // End SubtargetPredicate = isCIVI
-
-// CI Only flat instructions
-
-let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in {
-
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32,
- null_frag, v2f32, VReg_64
->;
-defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
- flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32
->;
-defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
- flat<0x40>, "flat_atomic_fmax", VGPR_32, f32
->;
-defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64,
- null_frag, v2f64, VReg_128
->;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
- flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64
->;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
- flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64
->;
-
-} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1
-
-//===----------------------------------------------------------------------===//
-// Flat Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isCIVI] in {
-
-// Patterns for global loads with no offset.
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
->;
-
-class FlatLoadAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 1, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
-
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
-
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0, 0)
->;
-
-class FlatStoreAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- // atomic store follows atomic binop convention so the address comes
- // first.
- (node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
-
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : Pat <
- (vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
-
-} // End Predicates = [isCIVI]
+// S_CBRANCH_CDBGSYS_AND_USER
\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index 98bc6e8..6b8e85a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -37,6 +37,9 @@ def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
def MULHI_INT_cm : MULHI_INT_Common<0x90>;
def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def MULHI_INT_cm24 : MULHI_INT24_Common<0x5c>;
+def MULHI_UINT_cm24 : MULHI_UINT24_Common<0xb2>;
+
def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
@@ -85,14 +88,13 @@ def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> {
let eop = 0; // This bit is not used on Cayman.
}
-class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
+class VTX_READ_cm <string name, dag outs>
+ : VTX_WORD0_cm, VTX_READ<name, outs, []> {
// Static fields
let VC_INST = 0;
let FETCH_TYPE = 2;
let FETCH_WHOLE_QUAD = 0;
- let BUFFER_ID = buffer_id;
let SRC_REL = 0;
// XXX: We can infer this field based on the SRC_GPR. This would allow us
// to store vertex addresses in any channel, not just X.
@@ -105,9 +107,9 @@ class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
let Inst{31-0} = Word0;
}
-class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_8_cm
+ : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let DST_SEL_X = 0;
let DST_SEL_Y = 7; // Masked
@@ -116,9 +118,9 @@ class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
let DATA_FORMAT = 1; // FMT_8
}
-class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_16_cm
+ : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let DST_SEL_X = 0;
let DST_SEL_Y = 7; // Masked
let DST_SEL_Z = 7; // Masked
@@ -127,9 +129,9 @@ class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
}
-class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_32_cm
+ : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let DST_SEL_X = 0;
let DST_SEL_Y = 7; // Masked
@@ -147,9 +149,9 @@ class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
-class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_Reg64:$dst_gpr), pattern> {
+def VTX_READ_64_cm
+ : VTX_READ_cm <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+ (outs R600_Reg64:$dst_gpr)> {
let DST_SEL_X = 0;
let DST_SEL_Y = 1;
@@ -158,9 +160,9 @@ class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
let DATA_FORMAT = 0x1D; // COLOR_32_32
}
-class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
- (outs R600_Reg128:$dst_gpr), pattern> {
+def VTX_READ_128_cm
+ : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+ (outs R600_Reg128:$dst_gpr)> {
let DST_SEL_X = 0;
let DST_SEL_Y = 1;
@@ -177,79 +179,44 @@ class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
//===----------------------------------------------------------------------===//
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
-def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0,
- [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0,
- [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
- [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
- [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 3)>;
-def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
- [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 2)>;
//===----------------------------------------------------------------------===//
// VTX Read from global memory space
//===----------------------------------------------------------------------===//
-
-// 8-bit reads
-def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1,
- [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
->;
-
-// 16-bit reads
-def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1,
- [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1,
- [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1,
- [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1,
- [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 8-bit reads
-def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2,
- [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
->;
-
-// 16-bit reads
-def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2,
- [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2,
- [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2,
- [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_ID2_128_cm : VTX_READ_128_cm <2,
- [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_cm MEMxi:$src_gpr, 1)>;
} // End isCayman
diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
new file mode 100644
index 0000000..a077001
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -0,0 +1,906 @@
+//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>,
+ SIMCInstr <opName, SIEncodingFamily.NONE> {
+
+ let SubtargetPredicate = isGCN;
+
+ let LGKM_CNT = 1;
+ let DS = 1;
+ let Size = 8;
+ let UseNamedOperandTable = 1;
+ let Uses = [M0, EXEC];
+
+  // Most instructions load and store data, so set this as the default.
+ let mayLoad = 1;
+ let mayStore = 1;
+
+ let hasSideEffects = 0;
+ let SchedRW = [WriteLDS];
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let AsmMatchConverter = "cvtDS";
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+  // These bits are a bit of a hack; it would be more natural to test the
+  // "outs" and "ins" dags for the presence of particular operands.
+ bits<1> has_vdst = 1;
+ bits<1> has_addr = 1;
+ bits<1> has_data0 = 1;
+ bits<1> has_data1 = 1;
+
+  bits<1> has_offset  = 1; // has a single "offset" operand that is split into offset0/offset1
+ bits<1> has_offset0 = 1;
+ bits<1> has_offset1 = 1;
+
+ bits<1> has_gds = 1;
+ bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value
+}
+
+class DS_Real <DS_Pseudo ds> :
+ InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # " " # ds.AsmOperands, []>,
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ds.SubtargetPredicate;
+ let AsmMatchConverter = ds.AsmMatchConverter;
+
+ // encoding fields
+ bits<8> vdst;
+ bits<1> gds;
+ bits<8> addr;
+ bits<8> data0;
+ bits<8> data1;
+ bits<8> offset0;
+ bits<8> offset1;
+
+ bits<16> offset;
+ let offset0 = !if(ds.has_offset, offset{7-0}, ?);
+ let offset1 = !if(ds.has_offset, offset{15-8}, ?);
+}
+
+
+// DS Pseudo instructions
+
+class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ "$addr, $data0$offset$gds">,
+ AtomicNoRet<opName, 0> {
+
+ let has_data1 = 0;
+ let has_vdst = 0;
+}
+
+class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
+ "$addr $offset0$offset1$gds"> {
+
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_vdst = 0;
+ let has_offset = 0;
+ let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
+  "$addr, $data0, $data1$offset$gds">,
+ AtomicNoRet<opName, 0> {
+
+ let has_vdst = 0;
+}
+
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+ offset0:$offset0, offset1:$offset1, gds:$gds),
+ "$addr, $data0, $data1$offset0$offset1$gds"> {
+
+ let has_vdst = 0;
+ let has_offset = 0;
+ let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ "$vdst, $addr, $data0$offset$gds"> {
+
+ let hasPostISelHook = 1;
+ let has_data1 = 0;
+}
+
+class DS_1A2D_RET<string opName,
+ RegisterClass rc = VGPR_32,
+ RegisterClass src = rc>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+ "$vdst, $addr, $data0, $data1$offset$gds"> {
+
+ let hasPostISelHook = 1;
+}
+
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ "$vdst, $addr$offset$gds"> {
+
+ let has_data0 = 0;
+ let has_data1 = 0;
+}
+
+class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
+ "$vdst, $addr$offset0$offset1$gds"> {
+
+ let has_offset = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$addr, offset:$offset),
+ "$vdst, $addr$offset gds"> {
+
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_gds = 0;
+ let gdsValue = 1;
+}
+
+class DS_0A_RET <string opName> : DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins offset:$offset, gds:$gds),
+ "$vdst$offset$gds"> {
+
+ let mayLoad = 1;
+ let mayStore = 1;
+
+ let has_addr = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+}
+
+class DS_1A <string opName> : DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ "$addr$offset$gds"> {
+
+ let mayLoad = 1;
+ let mayStore = 1;
+
+ let has_vdst = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+}
+
+class DS_1A_GDS <string opName> : DS_Pseudo<opName,
+ (outs),
+ (ins VGPR_32:$addr),
+ "$addr gds"> {
+
+ let has_vdst = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_offset = 0;
+ let has_offset0 = 0;
+ let has_offset1 = 0;
+
+ let has_gds = 0;
+ let gdsValue = 1;
+}
+
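+// Lane-permute pseudo: the address operand selects a lane within the wave
+// rather than an LDS address, so no LDS memory is read or written.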
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+: DS_Pseudo<opName,
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
+ "$vdst, $addr, $data0$offset",
+ [(set i32:$vdst,
+ (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let isConvergent = 1;
+
+ let has_data1 = 0;
+ let has_gds = 0;
+}
+
+def DS_ADD_U32 : DS_1A1D_NORET<"ds_add_u32">;
+def DS_SUB_U32 : DS_1A1D_NORET<"ds_sub_u32">;
+def DS_RSUB_U32 : DS_1A1D_NORET<"ds_rsub_u32">;
+def DS_INC_U32 : DS_1A1D_NORET<"ds_inc_u32">;
+def DS_DEC_U32 : DS_1A1D_NORET<"ds_dec_u32">;
+def DS_MIN_I32 : DS_1A1D_NORET<"ds_min_i32">;
+def DS_MAX_I32 : DS_1A1D_NORET<"ds_max_i32">;
+def DS_MIN_U32 : DS_1A1D_NORET<"ds_min_u32">;
+def DS_MAX_U32 : DS_1A1D_NORET<"ds_max_u32">;
+def DS_AND_B32 : DS_1A1D_NORET<"ds_and_b32">;
+def DS_OR_B32 : DS_1A1D_NORET<"ds_or_b32">;
+def DS_XOR_B32 : DS_1A1D_NORET<"ds_xor_b32">;
+def DS_ADD_F32 : DS_1A1D_NORET<"ds_add_f32">;
+def DS_MIN_F32 : DS_1A1D_NORET<"ds_min_f32">;
+def DS_MAX_F32 : DS_1A1D_NORET<"ds_max_f32">;
+
+let mayLoad = 0 in {
+def DS_WRITE_B8 : DS_1A1D_NORET<"ds_write_b8">;
+def DS_WRITE_B16 : DS_1A1D_NORET<"ds_write_b16">;
+def DS_WRITE_B32 : DS_1A1D_NORET<"ds_write_b32">;
+def DS_WRITE2_B32 : DS_1A2D_Off8_NORET<"ds_write2_b32">;
+def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">;
+}
+
+def DS_MSKOR_B32 : DS_1A2D_NORET<"ds_mskor_b32">;
+def DS_CMPST_B32 : DS_1A2D_NORET<"ds_cmpst_b32">;
+def DS_CMPST_F32 : DS_1A2D_NORET<"ds_cmpst_f32">;
+
+def DS_ADD_U64 : DS_1A1D_NORET<"ds_add_u64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET<"ds_sub_u64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET<"ds_inc_u64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET<"ds_dec_u64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET<"ds_min_i64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET<"ds_max_i64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET<"ds_min_u64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET<"ds_max_u64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET<"ds_and_b64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET<"ds_or_b64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET<"ds_xor_b64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>;
+let mayLoad = 0 in {
+def DS_WRITE_B64 : DS_1A1D_NORET<"ds_write_b64", VReg_64>;
+def DS_WRITE2_B64 : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>;
+def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>;
+}
+def DS_CMPST_B64 : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET<"ds_min_f64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET<"ds_max_f64", VReg_64>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET<"ds_add_rtn_u32">,
+ AtomicNoRet<"ds_add_u32", 1>;
+def DS_ADD_RTN_F32 : DS_1A1D_RET<"ds_add_rtn_f32">,
+ AtomicNoRet<"ds_add_f32", 1>;
+def DS_SUB_RTN_U32 : DS_1A1D_RET<"ds_sub_rtn_u32">,
+ AtomicNoRet<"ds_sub_u32", 1>;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET<"ds_rsub_rtn_u32">,
+ AtomicNoRet<"ds_rsub_u32", 1>;
+def DS_INC_RTN_U32 : DS_1A1D_RET<"ds_inc_rtn_u32">,
+ AtomicNoRet<"ds_inc_u32", 1>;
+def DS_DEC_RTN_U32 : DS_1A1D_RET<"ds_dec_rtn_u32">,
+ AtomicNoRet<"ds_dec_u32", 1>;
+def DS_MIN_RTN_I32 : DS_1A1D_RET<"ds_min_rtn_i32">,
+ AtomicNoRet<"ds_min_i32", 1>;
+def DS_MAX_RTN_I32 : DS_1A1D_RET<"ds_max_rtn_i32">,
+ AtomicNoRet<"ds_max_i32", 1>;
+def DS_MIN_RTN_U32 : DS_1A1D_RET<"ds_min_rtn_u32">,
+ AtomicNoRet<"ds_min_u32", 1>;
+def DS_MAX_RTN_U32 : DS_1A1D_RET<"ds_max_rtn_u32">,
+ AtomicNoRet<"ds_max_u32", 1>;
+def DS_AND_RTN_B32 : DS_1A1D_RET<"ds_and_rtn_b32">,
+ AtomicNoRet<"ds_and_b32", 1>;
+def DS_OR_RTN_B32 : DS_1A1D_RET<"ds_or_rtn_b32">,
+ AtomicNoRet<"ds_or_b32", 1>;
+def DS_XOR_RTN_B32 : DS_1A1D_RET<"ds_xor_rtn_b32">,
+ AtomicNoRet<"ds_xor_b32", 1>;
+def DS_MSKOR_RTN_B32 : DS_1A2D_RET<"ds_mskor_rtn_b32">,
+ AtomicNoRet<"ds_mskor_b32", 1>;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <"ds_cmpst_rtn_b32">,
+ AtomicNoRet<"ds_cmpst_b32", 1>;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <"ds_cmpst_rtn_f32">,
+ AtomicNoRet<"ds_cmpst_f32", 1>;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <"ds_min_rtn_f32">,
+ AtomicNoRet<"ds_min_f32", 1>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">,
+ AtomicNoRet<"ds_max_f32", 1>;
+
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+ AtomicNoRet<"", 1>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_add_u64", 1>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_sub_u64", 1>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_rsub_u64", 1>;
+def DS_INC_RTN_U64 : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_inc_u64", 1>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_dec_u64", 1>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>,
+ AtomicNoRet<"ds_min_i64", 1>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>,
+ AtomicNoRet<"ds_max_i64", 1>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_min_u64", 1>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>,
+ AtomicNoRet<"ds_max_u64", 1>;
+def DS_AND_RTN_B64 : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_and_b64", 1>;
+def DS_OR_RTN_B64 : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_or_b64", 1>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_xor_b64", 1>;
+def DS_MSKOR_RTN_B64 : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_mskor_b64", 1>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_cmpst_b64", 1>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_cmpst_f64", 1>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_min_f64", 1>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
+ AtomicNoRet<"ds_max_f64", 1>;
+
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
+ AtomicNoRet<"ds_wrxchg_b64", 1>;
+def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"ds_wrxchg2_b64", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
+
+def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">;
+def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">;
+def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">;
+
+def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
+def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
+def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">;
+def DS_INC_SRC2_U32 : DS_1A<"ds_inc_src2_u32">;
+def DS_DEC_SRC2_U32 : DS_1A<"ds_dec_src2_u32">;
+def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">;
+def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">;
+def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">;
+def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">;
+def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">;
+def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">;
+def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">;
+def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">;
+def DS_MAX_SRC2_F32 : DS_1A<"ds_max_src2_f32">;
+
+def DS_ADD_SRC2_U64 : DS_1A<"ds_add_src2_u64">;
+def DS_SUB_SRC2_U64 : DS_1A<"ds_sub_src2_u64">;
+def DS_RSUB_SRC2_U64 : DS_1A<"ds_rsub_src2_u64">;
+def DS_INC_SRC2_U64 : DS_1A<"ds_inc_src2_u64">;
+def DS_DEC_SRC2_U64 : DS_1A<"ds_dec_src2_u64">;
+def DS_MIN_SRC2_I64 : DS_1A<"ds_min_src2_i64">;
+def DS_MAX_SRC2_I64 : DS_1A<"ds_max_src2_i64">;
+def DS_MIN_SRC2_U64 : DS_1A<"ds_min_src2_u64">;
+def DS_MAX_SRC2_U64 : DS_1A<"ds_max_src2_u64">;
+def DS_AND_SRC2_B64 : DS_1A<"ds_and_src2_b64">;
+def DS_OR_SRC2_B64 : DS_1A<"ds_or_src2_b64">;
+def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">;
+def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">;
+def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
+
+def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">;
+def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">;
+
+let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+}
+
+let mayStore = 0 in {
+def DS_READ_I8 : DS_1A_RET<"ds_read_i8">;
+def DS_READ_U8 : DS_1A_RET<"ds_read_u8">;
+def DS_READ_I16 : DS_1A_RET<"ds_read_i16">;
+def DS_READ_U16 : DS_1A_RET<"ds_read_u16">;
+def DS_READ_B32 : DS_1A_RET<"ds_read_b32">;
+def DS_READ_B64 : DS_1A_RET<"ds_read_b64", VReg_64>;
+
+def DS_READ2_B32 : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>;
+def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>;
+
+def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
+def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
+}
+
+let SubtargetPredicate = isSICI in {
+def DS_CONSUME : DS_0A_RET<"ds_consume">;
+def DS_APPEND : DS_0A_RET<"ds_append">;
+def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+
+let SubtargetPredicate = isCIVI in {
+
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
+ AtomicNoRet<"ds_wrap_f32", 1>;
+
+} // let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isVI in {
+
+let Uses = [EXEC] in {
+def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
+ int_amdgcn_ds_permute>;
+def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
+ int_amdgcn_ds_bpermute>;
+}
+
+} // let SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// DS Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+def : Pat <
+ (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
+>;
+
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+ (inst $ptr, (as_i16imm $offset), (i1 0))
+>;
+
+def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_U16, i16, si_load_local>;
+def : DSReadPat <DS_READ_B32, i32, si_load_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
+
+} // End AddedComplexity = 100
+
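+// A 64-bit local load that is only 4-byte aligned is split into two dwords
+// and selected as ds_read2_b32.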
+def : Pat <
+ (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1))),
+ (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
+def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
+} // End AddedComplexity = 100
+
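+// Likewise, a 4-byte-aligned 64-bit local store is split into two dwords
+// and selected as ds_write2_b32.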
+def : Pat <
+ (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1)),
+ (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+ (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+ (i1 0))
+>;
+
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
+ (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
+>;
+
+
+// 32-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
+
+} // let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Real instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_si <bits<8> op, DS_Pseudo ds> :
+ DS_Real <ds>,
+ SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> {
+  let AssemblerPredicates = [isSICI];
+ let DecoderNamespace="SICI";
+
+ // encoding
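+  // Operands the pseudo does not have are encoded as zero.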
+ let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
+ let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x36; // ds prefix
+ let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{47-40} = !if(ds.has_data0, data0, 0);
+ let Inst{55-48} = !if(ds.has_data1, data1, 0);
+ let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>;
+def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>;
+def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>;
+def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>;
+def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>;
+def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>;
+def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>;
+def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>;
+def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>;
+def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>;
+def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>;
+def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>;
+def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>;
+def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>;
+def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
+def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
+
+// FIXME: this instruction is actually CI/VI
+def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+
+def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
+def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
+def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>;
+def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>;
+def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>;
+def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>;
+def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>;
+def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>;
+def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
+def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>;
+def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>;
+def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>;
+def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>;
+def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>;
+def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>;
+def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>;
+def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>;
+def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>;
+def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>;
+def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_si : DS_Real_si<0x4e, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>;
+def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>;
+
+def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>;
+
+def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>;
+def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>;
+
+def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
+
+def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
+
+def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
+
+def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+
+//===----------------------------------------------------------------------===//
+// VIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
+ DS_Real <ds>,
+ SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace="VI";
+
+ // encoding
+ let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
+ let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{24-17} = op;
+ let Inst{31-26} = 0x36; // ds prefix
+ let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{47-40} = !if(ds.has_data0, data0, 0);
+ let Inst{55-48} = !if(ds.has_data1, data1, 0);
+ let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
+def DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>;
+def DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>;
+def DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>;
+def DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>;
+def DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>;
+def DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>;
+def DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>;
+def DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>;
+def DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>;
+def DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>;
+def DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>;
+def DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>;
+def DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>;
+def DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
+def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
+def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
+def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
+def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
+def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
+def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
+def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
+def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
+def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
+def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
+
+def DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>;
+def DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>;
+def DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>;
+def DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>;
+def DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>;
+def DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>;
+def DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>;
+def DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>;
+def DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>;
+def DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>;
+def DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_vi : DS_Real_vi<0x4e, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>;
+def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>;
+
+def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
+
+def DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>;
+def DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>;
+
+def DS_ADD_SRC2_U32_vi : DS_Real_vi<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_vi : DS_Real_vi<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_vi : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_vi : DS_Real_vi<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_vi : DS_Real_vi<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_vi : DS_Real_vi<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_vi : DS_Real_vi<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_vi : DS_Real_vi<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_vi : DS_Real_vi<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_vi : DS_Real_vi<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_vi : DS_Real_vi<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
+def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_vi : DS_Real_vi<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_vi : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_vi : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_vi : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_vi : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_vi : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_vi : DS_Real_vi<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_vi : DS_Real_vi<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
+def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e11de85..2247cad 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -28,6 +28,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
@@ -48,6 +49,18 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) {
MCDisassembler::SoftFail;
}
+static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+
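+  // The 16-bit immediate is a signed dword offset relative to the
+  // instruction following the branch, so scale it by 4 and bias by PC + 4.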
+ APInt SignedOffset(18, Imm * 4, true);
+ int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();
+
+ if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2))
+ return MCDisassembler::Success;
+ return addOperand(Inst, MCOperand::createImm(Imm));
+}
+
#define DECODE_OPERAND2(RegClass, DecName) \
static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
unsigned Imm, \
@@ -68,12 +81,22 @@ DECODE_OPERAND(VReg_96)
DECODE_OPERAND(VReg_128)
DECODE_OPERAND(SReg_32)
-DECODE_OPERAND(SReg_32_XM0)
+DECODE_OPERAND(SReg_32_XM0_XEXEC)
DECODE_OPERAND(SReg_64)
+DECODE_OPERAND(SReg_64_XEXEC)
DECODE_OPERAND(SReg_128)
DECODE_OPERAND(SReg_256)
DECODE_OPERAND(SReg_512)
+
+static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
+}
+
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
@@ -217,12 +240,14 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
// ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
// this bundle?
default:
- assert(false);
- break;
+ llvm_unreachable("unhandled register class");
}
- if (Val % (1 << shift))
+
+ if (Val % (1 << shift)) {
*CommentStream << "Warning: " << getRegClassName(SRegClassID)
<< ": scalar reg isn't aligned " << Val;
+ }
+
return createRegOperand(SRegClassID, Val >> shift);
}
@@ -234,7 +259,16 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
+ return decodeSrcOp(OPW16, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+ // Some instructions have operand restrictions beyond what the encoding
+ // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
+ // high bit.
+ Val &= 255;
+
return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
}
@@ -257,13 +291,17 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
return decodeSrcOp(OPW32, Val);
}
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const {
- // SReg_32_XM0 is SReg_32 without M0
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC(
+ unsigned Val) const {
+  // SReg_32_XM0_XEXEC is SReg_32 without M0 or EXEC_LO/EXEC_HI
return decodeOperand_SReg_32(Val);
}
MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
- // see decodeOperand_SReg_32 comment
+ return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
@@ -299,28 +337,96 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
// Cast prevents negative overflow.
}
-MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
+static int64_t getInlineImmVal32(unsigned Imm) {
+ switch (Imm) {
+ case 240:
+ return FloatToBits(0.5f);
+ case 241:
+ return FloatToBits(-0.5f);
+ case 242:
+ return FloatToBits(1.0f);
+ case 243:
+ return FloatToBits(-1.0f);
+ case 244:
+ return FloatToBits(2.0f);
+ case 245:
+ return FloatToBits(-2.0f);
+ case 246:
+ return FloatToBits(4.0f);
+ case 247:
+ return FloatToBits(-4.0f);
+ case 248: // 1 / (2 * PI)
+ return 0x3e22f983;
+ default:
+ llvm_unreachable("invalid fp inline imm");
+ }
+}
+
+static int64_t getInlineImmVal64(unsigned Imm) {
+ switch (Imm) {
+ case 240:
+ return DoubleToBits(0.5);
+ case 241:
+ return DoubleToBits(-0.5);
+ case 242:
+ return DoubleToBits(1.0);
+ case 243:
+ return DoubleToBits(-1.0);
+ case 244:
+ return DoubleToBits(2.0);
+ case 245:
+ return DoubleToBits(-2.0);
+ case 246:
+ return DoubleToBits(4.0);
+ case 247:
+ return DoubleToBits(-4.0);
+ case 248: // 1 / (2 * PI)
+ return 0x3fc45f306dc9c882;
+ default:
+ llvm_unreachable("invalid fp inline imm");
+ }
+}
+
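+// 16-bit inline constants are IEEE half-precision bit patterns
+// (e.g. 0x3800 is 0.5, 0xBC00 is -1.0).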
+static int64_t getInlineImmVal16(unsigned Imm) {
+ switch (Imm) {
+ case 240:
+ return 0x3800;
+ case 241:
+ return 0xB800;
+ case 242:
+ return 0x3C00;
+ case 243:
+ return 0xBC00;
+ case 244:
+ return 0x4000;
+ case 245:
+ return 0xC000;
+ case 246:
+ return 0x4400;
+ case 247:
+ return 0xC400;
+ case 248: // 1 / (2 * PI)
+ return 0x3118;
+ default:
+ llvm_unreachable("invalid fp inline imm");
+ }
+}
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
&& Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+
// ToDo: case 248: 1/(2*PI) - is allowed only on VI
- // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as
- // literal constant.
- float V = 0.0f;
- switch (Imm) {
- case 240: V = 0.5f; break;
- case 241: V = -0.5f; break;
- case 242: V = 1.0f; break;
- case 243: V = -1.0f; break;
- case 244: V = 2.0f; break;
- case 245: V = -2.0f; break;
- case 246: V = 4.0f; break;
- case 247: V = -4.0f; break;
- case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI)
- 0x3e22f983 :
- 0x3fc45f306dc9c882);
- default: break;
+ switch (Width) {
+ case OPW32:
+ return MCOperand::createImm(getInlineImmVal32(Imm));
+ case OPW64:
+ return MCOperand::createImm(getInlineImmVal64(Imm));
+ case OPW16:
+ return MCOperand::createImm(getInlineImmVal16(Imm));
+ default:
+ llvm_unreachable("implement me");
}
- return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
}
unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
@@ -328,7 +434,9 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
- case OPW32: return VGPR_32RegClassID;
+ case OPW32:
+ case OPW16:
+ return VGPR_32RegClassID;
case OPW64: return VReg_64RegClassID;
case OPW128: return VReg_128RegClassID;
}
@@ -339,7 +447,9 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
- case OPW32: return SGPR_32RegClassID;
+ case OPW32:
+ case OPW16:
+ return SGPR_32RegClassID;
case OPW64: return SGPR_64RegClassID;
case OPW128: return SGPR_128RegClassID;
}
@@ -350,7 +460,9 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
switch (Width) {
default: // fall
- case OPW32: return TTMP_32RegClassID;
+ case OPW32:
+ case OPW16:
+ return TTMP_32RegClassID;
case OPW64: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
}
@@ -371,19 +483,26 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
}
- assert(Width == OPW32 || Width == OPW64);
- const bool Is32 = (Width == OPW32);
+ assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
return decodeIntImmed(Val);
if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
- return decodeFPImmed(Is32, Val);
+ return decodeFPImmed(Width, Val);
if (Val == LITERAL_CONST)
return decodeLiteralConstant();
- return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
+ switch (Width) {
+ case OPW32:
+ case OPW16:
+ return decodeSpecialReg32(Val);
+ case OPW64:
+ return decodeSpecialReg64(Val);
+ default:
+ llvm_unreachable("unexpected immediate type");
+ }
}
MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
@@ -426,6 +545,56 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+// Try to find a symbol name for the specified label.
+bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
+ raw_ostream &/*cStream*/, int64_t Value,
+ uint64_t /*Address*/, bool IsBranch,
+ uint64_t /*Offset*/, uint64_t /*InstSize*/) {
+ typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy;
+ typedef std::vector<SymbolInfoTy> SectionSymbolsTy;
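+  // Each SymbolInfoTy entry is (symbol address, symbol name, ELF symbol type).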
+
+ if (!IsBranch) {
+ return false;
+ }
+
+ auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+ auto Result = std::find_if(Symbols->begin(), Symbols->end(),
+ [Value](const SymbolInfoTy& Val) {
+ return std::get<0>(Val) == static_cast<uint64_t>(Value)
+ && std::get<2>(Val) == ELF::STT_NOTYPE;
+ });
+ if (Result != Symbols->end()) {
+ auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result));
+ const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
+ Inst.addOperand(MCOperand::createExpr(Add));
+ return true;
+ }
+ return false;
+}
+
+void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+ int64_t Value,
+ uint64_t Address) {
+ llvm_unreachable("unimplemented");
+}
+
+//===----------------------------------------------------------------------===//
+// Initialization
+//===----------------------------------------------------------------------===//
+
+static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
+ LLVMOpInfoCallback /*GetOpInfo*/,
+ LLVMSymbolLookupCallback /*SymbolLookUp*/,
+ void *DisInfo,
+ MCContext *Ctx,
+ std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+ return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
+}
+
static MCDisassembler *createAMDGPUDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
@@ -433,5 +602,8 @@ static MCDisassembler *createAMDGPUDisassembler(const Target &T,
}
extern "C" void LLVMInitializeAMDGPUDisassembler() {
- TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
+ createAMDGPUDisassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
+ createAMDGPUSymbolizer);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index dff26a0..ee5883a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -18,76 +18,113 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
+#include <cstdint>
+#include <algorithm>
+#include <memory>
namespace llvm {
- class MCContext;
- class MCInst;
- class MCOperand;
- class MCSubtargetInfo;
- class Twine;
-
- class AMDGPUDisassembler : public MCDisassembler {
- private:
- mutable ArrayRef<uint8_t> Bytes;
-
- public:
- AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
- MCDisassembler(STI, Ctx) {}
-
- ~AMDGPUDisassembler() {}
-
- DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &WS, raw_ostream &CS) const override;
-
- const char* getRegClassName(unsigned RegClassID) const;
-
- MCOperand createRegOperand(unsigned int RegId) const;
- MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
- MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
-
- MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const;
-
- DecodeStatus tryDecodeInst(const uint8_t* Table,
- MCInst &MI,
- uint64_t Inst,
- uint64_t Address) const;
-
- MCOperand decodeOperand_VGPR_32(unsigned Val) const;
- MCOperand decodeOperand_VS_32(unsigned Val) const;
- MCOperand decodeOperand_VS_64(unsigned Val) const;
-
- MCOperand decodeOperand_VReg_64(unsigned Val) const;
- MCOperand decodeOperand_VReg_96(unsigned Val) const;
- MCOperand decodeOperand_VReg_128(unsigned Val) const;
-
- MCOperand decodeOperand_SReg_32(unsigned Val) const;
- MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const;
- MCOperand decodeOperand_SReg_64(unsigned Val) const;
- MCOperand decodeOperand_SReg_128(unsigned Val) const;
- MCOperand decodeOperand_SReg_256(unsigned Val) const;
- MCOperand decodeOperand_SReg_512(unsigned Val) const;
-
- enum OpWidthTy {
- OPW32,
- OPW64,
- OPW128,
- OPW_LAST_,
- OPW_FIRST_ = OPW32
- };
- unsigned getVgprClassId(const OpWidthTy Width) const;
- unsigned getSgprClassId(const OpWidthTy Width) const;
- unsigned getTtmpClassId(const OpWidthTy Width) const;
-
- static MCOperand decodeIntImmed(unsigned Imm);
- static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
- MCOperand decodeLiteralConstant() const;
-
- MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
- MCOperand decodeSpecialReg32(unsigned Val) const;
- MCOperand decodeSpecialReg64(unsigned Val) const;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+class Twine;
+
+//===----------------------------------------------------------------------===//
+// AMDGPUDisassembler
+//===----------------------------------------------------------------------===//
+
+class AMDGPUDisassembler : public MCDisassembler {
+private:
+ mutable ArrayRef<uint8_t> Bytes;
+
+public:
+ AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+ MCDisassembler(STI, Ctx) {}
+
+ ~AMDGPUDisassembler() override = default;
+
+ DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &WS, raw_ostream &CS) const override;
+
+ const char* getRegClassName(unsigned RegClassID) const;
+
+ MCOperand createRegOperand(unsigned int RegId) const;
+ MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
+ MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+
+ MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
+
+ DecodeStatus tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const;
+
+ MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_64(unsigned Val) const;
+ MCOperand decodeOperand_VSrc16(unsigned Val) const;
+
+ MCOperand decodeOperand_VReg_64(unsigned Val) const;
+ MCOperand decodeOperand_VReg_96(unsigned Val) const;
+ MCOperand decodeOperand_VReg_128(unsigned Val) const;
+
+ MCOperand decodeOperand_SReg_32(unsigned Val) const;
+ MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
+ MCOperand decodeOperand_SReg_128(unsigned Val) const;
+ MCOperand decodeOperand_SReg_256(unsigned Val) const;
+ MCOperand decodeOperand_SReg_512(unsigned Val) const;
+
+ enum OpWidthTy {
+ OPW32,
+ OPW64,
+ OPW128,
+ OPW16,
+ OPW_LAST_,
+ OPW_FIRST_ = OPW32
};
-} // namespace llvm
-#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+ unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getSgprClassId(const OpWidthTy Width) const;
+ unsigned getTtmpClassId(const OpWidthTy Width) const;
+
+ static MCOperand decodeIntImmed(unsigned Imm);
+ static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
+ MCOperand decodeLiteralConstant() const;
+
+ MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSpecialReg32(unsigned Val) const;
+ MCOperand decodeSpecialReg64(unsigned Val) const;
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+class AMDGPUSymbolizer : public MCSymbolizer {
+private:
+ void *DisInfo;
+
+public:
+ AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo,
+ void *disInfo)
+ : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {}
+
+ bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream,
+ int64_t Value, uint64_t Address,
+ bool IsBranch, uint64_t Offset,
+ uint64_t InstSize) override;
+
+ void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+ int64_t Value,
+ uint64_t Address) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 94f05cc..48c6592 100644
--- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -72,6 +72,8 @@ def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
def MULHI_INT_eg : MULHI_INT_Common<0x90>;
def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>;
+
def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
@@ -116,14 +118,13 @@ def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
} // End usesCustomInserter = 1
-class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
+class VTX_READ_eg <string name, dag outs>
+ : VTX_WORD0_eg, VTX_READ<name, outs, []> {
// Static fields
let VC_INST = 0;
let FETCH_TYPE = 2;
let FETCH_WHOLE_QUAD = 0;
- let BUFFER_ID = buffer_id;
let SRC_REL = 0;
// XXX: We can infer this field based on the SRC_GPR. This would allow us
// to store vertex addresses in any channel, not just X.
@@ -132,9 +133,9 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
let Inst{31-0} = Word0;
}
-class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_8_eg
+ : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let MEGA_FETCH_COUNT = 1;
let DST_SEL_X = 0;
@@ -144,9 +145,9 @@ class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
let DATA_FORMAT = 1; // FMT_8
}
-class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_16_eg
+ : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let MEGA_FETCH_COUNT = 2;
let DST_SEL_X = 0;
let DST_SEL_Y = 7; // Masked
@@ -156,9 +157,9 @@ class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
}
-class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
+def VTX_READ_32_eg
+ : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr",
+ (outs R600_TReg32_X:$dst_gpr)> {
let MEGA_FETCH_COUNT = 4;
let DST_SEL_X = 0;
@@ -177,9 +178,9 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
-class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
- (outs R600_Reg64:$dst_gpr), pattern> {
+def VTX_READ_64_eg
+ : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+ (outs R600_Reg64:$dst_gpr)> {
let MEGA_FETCH_COUNT = 8;
let DST_SEL_X = 0;
@@ -189,9 +190,9 @@ class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
let DATA_FORMAT = 0x1D; // COLOR_32_32
}
-class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
- (outs R600_Reg128:$dst_gpr), pattern> {
+def VTX_READ_128_eg
+ : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+ (outs R600_Reg128:$dst_gpr)> {
let MEGA_FETCH_COUNT = 16;
let DST_SEL_X = 0;
@@ -209,80 +210,44 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
//===----------------------------------------------------------------------===//
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 3)>;
-def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3,
- [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3,
- [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3,
- [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3,
- [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3,
- [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 2)>;
//===----------------------------------------------------------------------===//
// VTX Read from global memory space
//===----------------------------------------------------------------------===//
-
-// 8-bit reads
-def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1,
- [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
->;
-
-// 16-bit reads
-def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1,
- [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1,
- [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_ID1_64_eg : VTX_READ_64_eg <1,
- [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1,
- [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 8-bit reads
-def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2,
- [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
->;
-
-// 16-bit reads
-def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2,
- [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2,
- [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2,
- [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2,
- [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
->;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_8_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_16_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_32_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_64_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+ (VTX_READ_128_eg MEMxi:$src_gpr, 1)>;
} // End Predicates = [isEG]
@@ -368,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
-def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
new file mode 100644
index 0000000..849fb8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -0,0 +1,530 @@
+//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
+class FLAT_Pseudo<string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let SubtargetPredicate = isCIVI;
+
+ let FLAT = 1;
+  // Internally, FLAT instructions are executed as both an LDS and a
+ // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
+ // and are not considered done until both have been decremented.
+ let VM_CNT = 1;
+ let LGKM_CNT = 1;
+
+ let Uses = [EXEC, FLAT_SCR]; // M0
+
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
+ let SchedRW = [WriteVMEM];
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_vdst = 1;
+ bits<1> has_data = 1;
+ bits<1> has_glc = 1;
+ bits<1> glcValue = 0;
+}
+
+class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ Enc64 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding fields
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<8> vdst;
+ bits<1> slc;
+ bits<1> glc;
+ bits<1> tfe;
+
+ // 15-0 is reserved.
+ let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
+ let Inst{17} = slc;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x37; // Encoding.
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = !if(ps.has_data, vdata, ?);
+ // 54-48 is reserved.
+ let Inst{55} = tfe;
+ let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
+}
+
+class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
+ opName,
+ (outs regClass:$vdst),
+ (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
+ " $vdst, $vaddr$glc$slc$tfe"> {
+ let has_data = 0;
+ let mayLoad = 1;
+}
+
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
+ opName,
+ (outs),
+ (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
+ " $vaddr, $vdata$glc$slc$tfe"> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_vdst = 0;
+}
+
+multiclass FLAT_Atomic_Pseudo<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc> {
+
+ def "" : FLAT_Pseudo <opName,
+ (outs),
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+ " $vaddr, $vdata$slc$tfe",
+ []>,
+ AtomicNoRet <NAME, 0> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 0;
+ let glcValue = 0;
+ let has_vdst = 0;
+ let PseudoInstr = NAME;
+ }
+
+ def _RTN : FLAT_Pseudo <opName,
+ (outs vdst_rc:$vdst),
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+ " $vdst, $vaddr, $vdata glc$slc$tfe",
+ [(set vt:$vdst,
+ (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ AtomicNoRet <NAME, 1> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasPostISelHook = 1;
+ let has_glc = 0;
+ let glcValue = 1;
+ let PseudoInstr = NAME # "_RTN";
+ }
+}
+
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>;
+def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>;
+def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>;
+def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>;
+def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>;
+def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>;
+def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>;
+def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>;
+def FLAT_STORE_SHORT : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>;
+def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
+def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
+ VGPR_32, i32, atomic_cmp_swap_flat,
+ v2i32, VReg_64>;
+
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
+ VReg_64, i64, atomic_cmp_swap_flat,
+ v2i64, VReg_128>;
+
+defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap",
+ VGPR_32, i32, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
+ VReg_64, i64, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add",
+ VGPR_32, i32, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub",
+ VGPR_32, i32, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin",
+ VGPR_32, i32, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin",
+ VGPR_32, i32, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax",
+ VGPR_32, i32, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax",
+ VGPR_32, i32, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and",
+ VGPR_32, i32, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or",
+ VGPR_32, i32, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor",
+ VGPR_32, i32, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc",
+ VGPR_32, i32, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec",
+ VGPR_32, i32, atomic_dec_flat>;
+
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
+ VReg_64, i64, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
+ VReg_64, i64, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
+ VReg_64, i64, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
+ VReg_64, i64, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
+ VReg_64, i64, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
+ VReg_64, i64, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
+ VReg_64, i64, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
+ VReg_64, i64, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
+ VReg_64, i64, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
+ VReg_64, i64, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
+ VReg_64, i64, atomic_dec_flat>;
+
+let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only?
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
+ VGPR_32, f32, null_frag, v2f32, VReg_64>;
+
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
+ VReg_64, f64, null_frag, v2f64, VReg_128>;
+
+defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
+ VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
+ VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
+ VReg_64, f64>;
+
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
+ VReg_64, f64>;
+
+} // End SubtargetPredicate = isCI
+
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
+ (ld node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+def atomic_flat_load : flat_ld <atomic_load>;
+def flat_load : flat_ld <load>;
+def flat_az_extloadi8 : flat_ld <az_extloadi8>;
+def flat_sextloadi8 : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16 : flat_ld <sextloadi16>;
+
+def atomic_flat_store : flat_st <atomic_store>;
+def flat_store : flat_st <store>;
+def flat_truncstorei8 : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
+// Patterns for global loads with no offset.
+class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 0, 0, 0)
+>;
+
+class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 1, 0, 0)
+>;
+
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+ (node vt:$data, i64:$addr),
+ (inst $addr, $data, 0, 0, 0)
+>;
+
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+ // atomic store follows atomic binop convention so the address comes
+ // first.
+ (node i64:$addr, vt:$data),
+ (inst $addr, $data, 1, 0, 0)
+>;
+
+class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : Pat <
+ (vt (node i64:$addr, data_vt:$data)),
+ (inst $addr, $data, 0, 0)
+>;
+
+let Predicates = [isCIVI] in {
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
+
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+
+} // End Predicates = [isCIVI]
+
+let Predicates = [isVI] in {
+ def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>;
+ def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
+ FLAT_Real <op, ps>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
+ let AssemblerPredicate = isCIOnly;
+ let DecoderNamespace="CI";
+}
+
+def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_ci : FLAT_Real_ci <0x9, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_ci : FLAT_Real_ci <0xa, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_ci : FLAT_Real_ci <0xb, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_ci : FLAT_Real_ci <0xc, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_ci : FLAT_Real_ci <0xd, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_ci : FLAT_Real_ci <0xe, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_ci : FLAT_Real_ci <0xf, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_ci : FLAT_Real_ci <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_ci : FLAT_Real_ci <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_ci : FLAT_Real_ci <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
+
+multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> {
+ def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+ def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>;
+
+// CI Only flat instructions
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>;
+defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>;
+defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
+ FLAT_Real <op, ps>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+ let AssemblerPredicate = isVI;
+ let DecoderNamespace="VI";
+}
+
+def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+ def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_vi <0x66, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+
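As a rough illustration of the FLAT_Real bit layout defined above (Inst{16} glc, Inst{17} slc, Inst{24-18} op, Inst{31-26} = 0x37, Inst{39-32} vaddr, Inst{47-40} vdata, Inst{55} tfe, Inst{63-56} vdst), the sketch below packs those fields into a 64-bit word. This is only an illustration of the encoding described by the TableGen class, not code from this patch; the function and parameter names are made up, and fields whose has_* flag is clear (encoded as '?' above) are not modeled.

    #include <cstdint>

    // Packs the FLAT encoding fields per the FLAT_Real class above. Assumes the
    // caller has already narrowed each field to its declared width.
    uint64_t packFlatEncoding(uint8_t Op /*7 bits*/, bool Glc, bool Slc, bool Tfe,
                              uint8_t VAddr, uint8_t VData, uint8_t VDst) {
      uint64_t Inst = 0;
      Inst |= uint64_t(Glc) << 16;          // Inst{16}
      Inst |= uint64_t(Slc) << 17;          // Inst{17}
      Inst |= uint64_t(Op & 0x7f) << 18;    // Inst{24-18}
      Inst |= uint64_t(0x37) << 26;         // Inst{31-26}, FLAT encoding
      Inst |= uint64_t(VAddr) << 32;        // Inst{39-32}
      Inst |= uint64_t(VData) << 40;        // Inst{47-40}
      Inst |= uint64_t(Tfe) << 55;          // Inst{55}
      Inst |= uint64_t(VDst) << 56;         // Inst{63-56}
      return Inst;
    }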
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 29b1f79..dd3b46f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -38,6 +38,33 @@ void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
CurrCycleInstr = MI;
}
+static bool isDivFMas(unsigned Opcode) {
+ return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
+}
+
+static bool isSGetReg(unsigned Opcode) {
+ return Opcode == AMDGPU::S_GETREG_B32;
+}
+
+static bool isSSetReg(unsigned Opcode) {
+ return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
+}
+
+static bool isRWLane(unsigned Opcode) {
+ return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
+}
+
+static bool isRFE(unsigned Opcode) {
+ return Opcode == AMDGPU::S_RFE_B64;
+}
+
+static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
+
+ const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
+ AMDGPU::OpName::simm16);
+ return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+}
+
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
@@ -48,9 +75,27 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
return NoopHazard;
+ if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+ return NoopHazard;
+
if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
return NoopHazard;
+ if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
+ return NoopHazard;
+
+ if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
+ return NoopHazard;
+
+ if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
+ return NoopHazard;
+
+ if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
+ return NoopHazard;
+
+ if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
+ return NoopHazard;
+
return NoHazard;
}
@@ -62,11 +107,32 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(0, checkSMRDHazards(MI));
- if (SIInstrInfo::isVMEM(*MI))
- return std::max(0, checkVMEMHazards(MI));
+ if (SIInstrInfo::isVALU(*MI)) {
+ int WaitStates = std::max(0, checkVALUHazards(MI));
- if (SIInstrInfo::isDPP(*MI))
- return std::max(0, checkDPPHazards(MI));
+ if (SIInstrInfo::isVMEM(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
+ if (SIInstrInfo::isDPP(*MI))
+ WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+ if (isDivFMas(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+ if (isRWLane(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+
+ return WaitStates;
+ }
+
+ if (isSGetReg(MI->getOpcode()))
+ return std::max(0, checkGetRegHazards(MI));
+
+ if (isSSetReg(MI->getOpcode()))
+ return std::max(0, checkSetRegHazards(MI));
+
+ if (isRFE(MI->getOpcode()))
+ return std::max(0, checkRFEHazards(MI));
return 0;
}
@@ -112,21 +178,40 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-int GCNHazardRecognizer::getWaitStatesSinceDef(
- unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+int GCNHazardRecognizer::getWaitStatesSince(
+ function_ref<bool(MachineInstr *)> IsHazard) {
int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
- if (!MI || !IsHazardDef(MI))
+ if (!MI || !IsHazard(MI))
continue;
- if (MI->modifiesRegister(Reg, TRI))
- return WaitStates;
+ return WaitStates;
}
return std::numeric_limits<int>::max();
}
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+ unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+ return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+ };
+
+ return getWaitStatesSince(IsHazardFn);
+}
+
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(
+ function_ref<bool(MachineInstr *)> IsHazard) {
+
+ auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+ return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+ };
+
+ return getWaitStatesSince(IsHazardFn);
+}
+
//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//
@@ -262,3 +347,156 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
return WaitStatesNeeded;
}
+
+int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // v_div_fmas requires 4 wait states after a write to vcc from a VALU
+ // instruction.
+ const int DivFMasWaitStates = 4;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+
+ return DivFMasWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
+
+ const int GetRegWaitStates = 2;
+ auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
+ return GetRegHWReg == getHWReg(TII, *MI);
+ };
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+
+ return GetRegWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned HWReg = getHWReg(TII, *SetRegInstr);
+
+ const int SetRegWaitStates =
+ ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+ auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
+ return HWReg == getHWReg(TII, *MI);
+ };
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ return SetRegWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+ if (!MI.mayStore())
+ return -1;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+ int VDataRCID = -1;
+ if (VDataIdx != -1)
+ VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+ if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+ // There is no hazard if the instruction does not use vector regs
+ // (like wbinvl1)
+ if (VDataIdx == -1)
+ return -1;
+ // For MUBUF/MTBUF instructions this hazard only exists if the
+ // instruction is not using a register in the soffset field.
+ const MachineOperand *SOffset =
+ TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ // If we have no soffset operand, then assume this field has been
+ // hardcoded to zero.
+ if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+ (!SOffset || !SOffset->isReg()))
+ return VDataIdx;
+ }
+
+ // MIMG instructions create a hazard if they don't use a 256-bit T# and
+ // the store size is greater than 8 bytes and they have more than two bits
+ // of their dmask set.
+ // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+ if (TII->isMIMG(MI)) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+ assert(SRsrcIdx != -1 &&
+ AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+ (void)SRsrcIdx;
+ }
+
+ if (TII->isFLAT(MI)) {
+ int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+ if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
+ return DataIdx;
+ }
+
+ return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+ // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have their store data overwritten by the next instruction.
+ if (!ST.has12DWordStoreHazard())
+ return 0;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+ const int VALUWaitStates = 1;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Def : VALU->defs()) {
+ if (!TRI->isVGPR(MRI, Def.getReg()))
+ continue;
+ unsigned Reg = Def.getReg();
+ auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+ int DataIdx = createsVALUHazard(*MI);
+ return DataIdx >= 0 &&
+ TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+ };
+ int WaitStatesNeededForDef =
+ VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+ }
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI =
+ RWLane->getParent()->getParent()->getRegInfo();
+
+ const MachineOperand *LaneSelectOp =
+ TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
+
+ if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
+ return 0;
+
+ unsigned LaneSelectReg = LaneSelectOp->getReg();
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isVALU(*MI);
+ };
+
+ const int RWLaneWaitStates = 4;
+ int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+ return RWLaneWaitStates - WaitStatesSince;
+}
+
+int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
+
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 0;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ const int RFEWaitStates = 1;
+
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
+ };
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ return RFEWaitStates - WaitStatesNeeded;
+}
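All of the checks above follow the same arithmetic: getWaitStatesSince() reports how many already-emitted instructions separate the hazard from the current instruction, each check returns the remaining wait states, and PreEmitNoops() clamps the result at zero. A small standalone sketch of that arithmetic (not from the patch; the function name is illustrative):

    #include <algorithm>

    // Remaining noops the recognizer must insert: the hazard's required wait
    // states minus how many instructions have already been emitted since it.
    int noopsNeeded(int RequiredWaitStates, int WaitStatesSinceHazard) {
      return std::max(0, RequiredWaitStates - WaitStatesSinceHazard);
    }

    // For the v_div_fmas case above: a VALU wrote VCC one instruction ago, so
    // noopsNeeded(4, 1) == 3 noops are still required before it can issue.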
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index d82041c..0ab82ff 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -35,14 +35,23 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const MachineFunction &MF;
const SISubtarget &ST;
+ int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
function_ref<bool(MachineInstr *)> IsHazardDef =
[](MachineInstr *) { return true; });
+ int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
int checkSMRDHazards(MachineInstr *SMRD);
int checkVMEMHazards(MachineInstr* VMEM);
int checkDPPHazards(MachineInstr *DPP);
+ int checkDivFMasHazards(MachineInstr *DivFMas);
+ int checkGetRegHazards(MachineInstr *GetRegInstr);
+ int checkSetRegHazards(MachineInstr *SetRegInstr);
+ int createsVALUHazard(const MachineInstr &MI);
+ int checkVALUHazards(MachineInstr *VALU);
+ int checkRWLaneHazards(MachineInstr *RWLane);
+ int checkRFEHazards(MachineInstr *RFE);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
new file mode 100644
index 0000000..2f88033
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,312 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This contains a MachineSchedStrategy implementation for maximizing wave
+/// occupancy on GCN hardware.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+ const MachineSchedContext *C) :
+ GenericScheduler(C) { }
+
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+ const MachineFunction &MF) {
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
+ ST.getOccupancyWithNumVGPRs(VGPRs));
+ return std::min(MinRegOccupancy,
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+}
+
+void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop, const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI,
+ int SGPRPressure,
+ int VGPRPressure,
+ int SGPRExcessLimit,
+ int VGPRExcessLimit,
+ int SGPRCriticalLimit,
+ int VGPRCriticalLimit) {
+
+ Cand.SU = SU;
+ Cand.AtTop = AtTop;
+
+ // getDownwardPressure() and getUpwardPressure() make temporary changes to
+  // the tracker, so we need to pass those functions a non-const copy.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ std::vector<unsigned> Pressure;
+ std::vector<unsigned> MaxPressure;
+
+ if (AtTop)
+ TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ else {
+ // FIXME: I think for bottom up scheduling, the register pressure is cached
+    // and can be retrieved by DAG->getPressureDiff(SU).
+ TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ }
+
+ int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+ // If two instructions increase the pressure of different register sets
+ // by the same amount, the generic scheduler will prefer to schedule the
+ // instruction that increases the set with the least amount of registers,
+ // which in our case would be SGPRs. This is rarely what we want, so
+ // when we report excess/critical register pressure, we do it either
+ // only for VGPRs or only for SGPRs.
+
+ // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+ const int MaxVGPRPressureInc = 16;
+ bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+ bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+
+ // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+ // to increase the likelihood we don't go over the limits. We should improve
+ // the analysis to look through dependencies to find the path with the least
+ // register pressure.
+ // FIXME: This is also necessary, because some passes that run after
+ // scheduling and before regalloc increase register pressure.
+ const int ErrorMargin = 3;
+ VGPRExcessLimit -= ErrorMargin;
+ SGPRExcessLimit -= ErrorMargin;
+
+  // We only need to update the RPDelta for instructions that increase
+ // register pressure. Instructions that decrease or keep reg pressure
+ // the same will be marked as RegExcess in tryCandidate() when they
+ // are compared with instructions that increase the register pressure.
+ if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+ Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+ }
+
+ if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+ Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
+ }
+
+ // Register pressure is considered 'CRITICAL' if it is approaching a value
+ // that would reduce the wave occupancy for the execution unit. When
+  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure
+  // have the same cost, so we don't need to prefer one over the other.
+
+ VGPRCriticalLimit -= ErrorMargin;
+ SGPRCriticalLimit -= ErrorMargin;
+
+ int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+ int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+ if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ if (SGPRDelta > VGPRDelta) {
+ Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+ } else {
+ Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+ }
+ }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeFromQueue()
+void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+ const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+ ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+ unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned SGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+ unsigned VGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+ unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
+ unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
+ unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
+
+ ReadyQueue &Q = Zone.Available;
+ for (SUnit *SU : Q) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
+ SGPRPressure, VGPRPressure,
+ SGPRExcessLimit, VGPRExcessLimit,
+ SGPRCriticalLimit, VGPRCriticalLimit);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ Cand.setBest(TryCand);
+ }
+ }
+}
+
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+ switch (Reason) {
+ default:
+ return Reason;
+ case GenericSchedulerBase::RegCritical:
+ case GenericSchedulerBase::RegExcess:
+ return -Reason;
+ }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ return SU;
+ }
+ // Set the bottom-up policy based on the state of the current bottom zone and
+ // the instructions outside the zone, including the top zone.
+ CandPolicy BotPolicy;
+ setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ CandPolicy TopPolicy;
+ setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+ // See if BotCand is still valid (because we previously scheduled from Top).
+ DEBUG(dbgs() << "Picking from Bot:\n");
+ if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+ BotCand.Policy != BotPolicy) {
+ BotCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(BotCand));
+ }
+
+ // Check if the top Q has a better candidate.
+ DEBUG(dbgs() << "Picking from Top:\n");
+ if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+ TopCand.Policy != TopPolicy) {
+ TopCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(TopCand));
+ }
+
+ // Pick best from BotCand and TopCand.
+ DEBUG(
+ dbgs() << "Top Cand: ";
+ traceCandidate(BotCand);
+ dbgs() << "Bot Cand: ";
+ traceCandidate(TopCand);
+ );
+ SchedCandidate Cand;
+ if (TopCand.Reason == BotCand.Reason) {
+ Cand = BotCand;
+ GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
+ TopCand.Reason = NoCand;
+ GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
+ } else {
+ TopCand.Reason = TopReason;
+ }
+ } else {
+ if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
+ Cand = TopCand;
+ } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
+ Cand = BotCand;
+ } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+ Cand = TopCand;
+ } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+ Cand = BotCand;
+ } else {
+ int TopRank = getBidirectionalReasonRank(TopCand.Reason);
+ int BotRank = getBidirectionalReasonRank(BotCand.Reason);
+ if (TopRank > BotRank) {
+ Cand = TopCand;
+ } else {
+ Cand = BotCand;
+ }
+ }
+ }
+ DEBUG(
+ dbgs() << "Picking: ";
+ traceCandidate(Cand);
+ );
+
+ IsTopNode = Cand.AtTop;
+ return Cand.SU;
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNode()
+SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ if (RegionPolicy.OnlyTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (RegionPolicy.OnlyBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidirectional(IsTopNode);
+ }
+ } while (SU->isScheduled);
+
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ return SU;
+}
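For context, a strategy like this is consumed by the generic machine scheduler rather than run on its own; the registration lives elsewhere in the target (the target machine sources), so the factory name below is illustrative, not quoted from this patch. A minimal sketch of the usual wiring:

    #include "GCNSchedStrategy.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/MachineScheduler.h"

    using namespace llvm;

    // Wrap the strategy in a ScheduleDAGMILive, which walks each scheduling
    // region and calls back into pickNode()/pickNodeFromQueue() defined above.
    static ScheduleDAGInstrs *
    createGCNMaxOccupancyScheduler(MachineSchedContext *C) {
      return new ScheduleDAGMILive(
          C, llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
    }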
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
new file mode 100644
index 0000000..4cfc0ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -0,0 +1,54 @@
+//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+
+/// This is a minimal scheduler strategy. The main difference between this
+/// and the GenericScheduler is that GCNSchedStrategy uses different
+/// heuristics to determine excess/critical pressure sets. Its goal is to
+/// maximize kernel occupancy (i.e. maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand);
+
+ void initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop, const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI,
+ int SGPRPressure, int VGPRPressure,
+ int SGPRExcessLimit, int VGPRExcessLimit,
+ int SGPRCriticalLimit, int VGPRCriticalLimit);
+
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone, const SIRegisterInfo *SRI,
+ unsigned SGPRPressure, unsigned VGPRPressure);
+
+public:
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+
+ SUnit *pickNode(bool &IsTopNode) override;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 2932d3b..7172a0a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -9,46 +9,52 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstPrinter.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-
-#include <string>
+#include <cassert>
using namespace llvm;
+using namespace llvm::AMDGPU;
void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot, const MCSubtargetInfo &STI) {
OS.flush();
- printInstruction(MI, OS);
-
+ printInstruction(MI, STI, OS);
printAnnotation(OS, Annot);
}
void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
}
void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ raw_ostream &O) {
O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
}
void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
-}
-
-void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+ // It's possible to end up with a 32-bit literal used with a 16-bit operand
+ // with ignored high bits. Print as 32-bit anyway in that case.
+ int64_t Imm = MI->getOperand(OpNo).getImm();
+ if (isInt<16>(Imm) || isUInt<16>(Imm))
+ O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
+ else
+ printU32ImmOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
@@ -66,8 +72,14 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
-void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo,
- raw_ostream& O, StringRef BitName) {
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
+void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, StringRef BitName) {
if (MI->getOperand(OpNo).getImm()) {
O << ' ' << BitName;
}
@@ -97,7 +109,8 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << " offset:";
@@ -106,7 +119,8 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset0:";
printU8ImmDecOperand(MI, OpNo, O);
@@ -114,74 +128,97 @@ void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset1:";
printU8ImmDecOperand(MI, OpNo, O);
}
}
-void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- printU32ImmOperand(MI, OpNo, O);
+ printU32ImmOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- printU32ImmOperand(MI, OpNo, O);
+ printU32ImmOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "gds");
}
void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "glc");
}
void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "slc");
}
void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "tfe");
}
void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " dmask:";
- printU16ImmOperand(MI, OpNo, O);
+ printU16ImmOperand(MI, OpNo, STI, O);
}
}
void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "unorm");
}
void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "da");
}
void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "r128");
}
void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "lwe");
}
-void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
+void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " compr";
+}
+
+void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " vm";
+}
+
+void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
- switch (reg) {
+ switch (RegNo) {
case AMDGPU::VCC:
O << "vcc";
return;
@@ -233,52 +270,54 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
// The low 8 bits of the encoding value are the register index, for both VGPRs
// and SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+ unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
unsigned NumRegs;
- if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
+ if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 3;
- } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) {
+ } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(RegNo)) {
O << "ttmp";
NumRegs = 2;
- RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen.
- } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) {
+ // Trap temps start at offset 112. TODO: Get this from tablegen.
+ RegIdx -= 112;
+ } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(RegNo)) {
O << "ttmp";
NumRegs = 4;
- RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen.
+ // Trap temps start at offset 112. TODO: Get this from tablegen.
+ RegIdx -= 112;
} else {
- O << getRegisterName(reg);
+ O << getRegisterName(RegNo);
return;
}
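
A minimal, self-contained sketch of the naming scheme handled above, assuming the usual AMDGPU assembly syntax for register tuples (the range-printing code is not shown in this hunk, so the exact output form is an approximation; regName and its arguments are illustrative, not the printer's real interface):

#include <iostream>
#include <string>

// The low 8 bits of the encoding are the register index; the register class
// picks the prefix and tuple width; tuples print as an inclusive range.
static std::string regName(const std::string &Prefix, unsigned RegIdx,
                           unsigned NumRegs) {
  if (NumRegs == 1)
    return Prefix + std::to_string(RegIdx);
  return Prefix + '[' + std::to_string(RegIdx) + ':' +
         std::to_string(RegIdx + NumRegs - 1) + ']';
}

int main() {
  std::cout << regName("v", 5, 1) << ' '      // v5
            << regName("s", 0, 4) << ' '      // s[0:3]
            << regName("ttmp", 114 - 112, 2)  // trap temps start at index 112
            << '\n';                          // -> ttmp[2:3]
}
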
@@ -291,7 +330,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
O << "_e64 ";
else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
@@ -301,10 +340,44 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
else
O << "_e32 ";
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int16_t SImm = static_cast<int16_t>(Imm);
+ if (SImm >= -16 && SImm <= 64) {
+ O << SImm;
+ return;
+ }
+
+ if (Imm == 0x3C00)
+ O<< "1.0";
+ else if (Imm == 0xBC00)
+ O<< "-1.0";
+ else if (Imm == 0x3800)
+ O<< "0.5";
+ else if (Imm == 0xB800)
+ O<< "-0.5";
+ else if (Imm == 0x4000)
+ O<< "2.0";
+ else if (Imm == 0xC000)
+ O<< "-2.0";
+ else if (Imm == 0x4400)
+ O<< "4.0";
+ else if (Imm == 0xC400)
+ O<< "-4.0";
+ else if (Imm == 0x3118) {
+ assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
+ O << "0.15915494";
+ } else
+ O << formatHex(static_cast<uint64_t>(Imm));
}
-void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
O << SImm;
@@ -329,11 +402,16 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) {
O << "4.0";
else if (Imm == FloatToBits(-4.0f))
O << "-4.0";
+ else if (Imm == 0x3e22f983 &&
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ O << "0.15915494";
else
O << formatHex(static_cast<uint64_t>(Imm));
}
-void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
int64_t SImm = static_cast<int64_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
O << SImm;
@@ -358,8 +436,11 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
O << "4.0";
else if (Imm == DoubleToBits(-4.0))
O << "-4.0";
+ else if (Imm == 0x3fc45f306dc9c882 &&
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ O << "0.15915494";
else {
- assert(isUInt<32>(Imm));
+ assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
// In rare situations, we will have a 32-bit literal in a 64-bit
// operand. This is technically allowed for the encoding of s_mov_b64.
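
A self-contained sketch of the inline-constant classification that printImmediate32 performs above; HasInv2Pi stands in for the FeatureInv2PiInlineImm subtarget check, and the hex fallback approximates formatHex:

#include <cstdint>
#include <cstdio>
#include <string>

// Classify a 32-bit operand the way printImmediate32 does: small integers and
// a handful of float bit patterns are inline constants, 1/(2*pi) only when the
// subtarget has FeatureInv2PiInlineImm, and anything else becomes a literal.
static std::string formatInlineImm32(uint32_t Imm, bool HasInv2Pi) {
  int32_t SImm = static_cast<int32_t>(Imm);
  if (SImm >= -16 && SImm <= 64)
    return std::to_string(SImm);

  switch (Imm) {
  case 0x3f800000: return "1.0";
  case 0xbf800000: return "-1.0";
  case 0x3f000000: return "0.5";
  case 0xbf000000: return "-0.5";
  case 0x40000000: return "2.0";
  case 0xc0000000: return "-2.0";
  case 0x40800000: return "4.0";
  case 0xc0800000: return "-4.0";
  case 0x3e22f983:                              // 1/(2*pi)
    if (HasInv2Pi)
      return "0.15915494";
    break;
  }

  char Buf[16];
  std::snprintf(Buf, sizeof(Buf), "0x%x", Imm); // stands in for formatHex
  return Buf;
}

int main() {
  std::printf("%s %s %s\n",
              formatInlineImm32(0x40800000, false).c_str(),                // 4.0
              formatInlineImm32(static_cast<uint32_t>(-5), false).c_str(), // -5
              formatInlineImm32(0x3e22f983, true).c_str());                // 0.15915494
}
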
@@ -368,7 +449,12 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
+ if (OpNo >= MI->getNumOperands()) {
+ O << "/*Missing OP" << OpNo << "*/";
+ return;
+ }
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
@@ -383,22 +469,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
} else if (Op.isImm()) {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- int RCID = Desc.OpInfo[OpNo].RegClass;
- if (RCID != -1) {
- const MCRegisterClass &ImmRC = MRI.getRegClass(RCID);
- if (ImmRC.getSize() == 4)
- printImmediate32(Op.getImm(), O);
- else if (ImmRC.getSize() == 8)
- printImmediate64(Op.getImm(), O);
- else
- llvm_unreachable("Invalid register class size");
- } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
- printImmediate32(Op.getImm(), O);
- } else {
+ switch (Desc.OpInfo[OpNo].OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case MCOI::OPERAND_IMMEDIATE:
+ printImmediate32(Op.getImm(), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ printImmediate64(Op.getImm(), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ printImmediate16(Op.getImm(), STI, O);
+ break;
+ case MCOI::OPERAND_UNKNOWN:
+ case MCOI::OPERAND_PCREL:
+ O << formatDec(Op.getImm());
+ break;
+ case MCOI::OPERAND_REGISTER:
+ // FIXME: This should be removed and handled somewhere else. Seems to come
+ // from a disassembler bug.
+ O << "/*invalid immediate*/";
+ break;
+ default:
// We hit this for the immediate instruction bits that don't yet have a
// custom printer.
- // TODO: Eventually this should be unnecessary.
- O << formatDec(Op.getImm());
+ llvm_unreachable("unexpected immediate operand type");
}
} else if (Op.isFPImm()) {
// We special case 0.0 because otherwise it will be printed as an integer.
@@ -406,12 +509,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << "0.0";
else {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass);
-
- if (ImmRC.getSize() == 4)
- printImmediate32(FloatToBits(Op.getFPImm()), O);
- else if (ImmRC.getSize() == 8)
- printImmediate64(DoubleToBits(Op.getFPImm()), O);
+ int RCID = Desc.OpInfo[OpNo].RegClass;
+ unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
+ if (RCBits == 32)
+ printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+ else if (RCBits == 64)
+ printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
else
llvm_unreachable("Invalid register class size");
}
@@ -424,32 +527,34 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
if (InputModifiers & SISrcMods::NEG)
O << '-';
if (InputModifiers & SISrcMods::ABS)
O << '|';
- printOperand(MI, OpNo + 1, O);
+ printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::ABS)
O << '|';
}
void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
- unsigned OpNo,
- raw_ostream &O) {
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
if (InputModifiers & SISrcMods::SEXT)
O << "sext(";
- printOperand(MI, OpNo + 1, O);
+ printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::SEXT)
O << ')';
}
-
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm <= 0x0ff) {
O << " quad_perm:[";
@@ -488,19 +593,22 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
O << " row_mask:";
- printU4ImmOperand(MI, OpNo, O);
+ printU4ImmOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
O << " bank_mask:";
- printU4ImmOperand(MI, OpNo, O);
+ printU4ImmOperand(MI, OpNo, STI, O);
}
void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm) {
O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
@@ -509,69 +617,180 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
+ using namespace llvm::AMDGPU::SDWA;
+
unsigned Imm = MI->getOperand(OpNo).getImm();
switch (Imm) {
- case 0: O << "BYTE_0"; break;
- case 1: O << "BYTE_1"; break;
- case 2: O << "BYTE_2"; break;
- case 3: O << "BYTE_3"; break;
- case 4: O << "WORD_0"; break;
- case 5: O << "WORD_1"; break;
- case 6: O << "DWORD"; break;
+ case SdwaSel::BYTE_0: O << "BYTE_0"; break;
+ case SdwaSel::BYTE_1: O << "BYTE_1"; break;
+ case SdwaSel::BYTE_2: O << "BYTE_2"; break;
+ case SdwaSel::BYTE_3: O << "BYTE_3"; break;
+ case SdwaSel::WORD_0: O << "WORD_0"; break;
+ case SdwaSel::WORD_1: O << "WORD_1"; break;
+ case SdwaSel::DWORD: O << "DWORD"; break;
default: llvm_unreachable("Invalid SDWA data select operand");
}
}
void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
O << "dst_sel:";
printSDWASel(MI, OpNo, O);
}
void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
O << "src0_sel:";
printSDWASel(MI, OpNo, O);
}
void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
O << "src1_sel:";
printSDWASel(MI, OpNo, O);
}
void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace llvm::AMDGPU::SDWA;
+
O << "dst_unused:";
unsigned Imm = MI->getOperand(OpNo).getImm();
switch (Imm) {
- case 0: O << "UNUSED_PAD"; break;
- case 1: O << "UNUSED_SEXT"; break;
- case 2: O << "UNUSED_PRESERVE"; break;
+ case DstUnused::UNUSED_PAD: O << "UNUSED_PAD"; break;
+ case DstUnused::UNUSED_SEXT: O << "UNUSED_SEXT"; break;
+ case DstUnused::UNUSED_PRESERVE: O << "UNUSED_PRESERVE"; break;
default: llvm_unreachable("Invalid SDWA dest_unused operand");
}
}
+template <unsigned N>
+void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+ unsigned En = MI->getOperand(EnIdx).getImm();
+
+ // FIXME: What do we do with compr? The meaning of en changes depending on
+ // whether compr is set.
+
+ if (En & (1 << N))
+ printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+ else
+ O << "off";
+}
+
+void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<0>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<1>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<2>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printExpSrcN<3>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ // This is really a 6 bit field.
+ uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+ if (Tgt <= 7)
+ O << " mrt" << Tgt;
+ else if (Tgt == 8)
+ O << " mrtz";
+ else if (Tgt == 9)
+ O << " null";
+ else if (Tgt >= 12 && Tgt <= 15)
+ O << " pos" << Tgt - 12;
+ else if (Tgt >= 32 && Tgt <= 63)
+ O << " param" << Tgt - 32;
+ else {
+ // Reserved values 10, 11
+ O << " invalid_target_" << Tgt;
+ }
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
+ switch (Imm) {
+ case 0:
+ O << "p10";
+ break;
+ case 1:
+ O << "p20";
+ break;
+ case 2:
+ O << "p0";
+ break;
+ default:
+ O << "invalid_param_" << Imm;
+ }
+}
- if (Imm == 2) {
- O << "P0";
- } else if (Imm == 1) {
- O << "P20";
- } else if (Imm == 0) {
- O << "P10";
- } else {
- llvm_unreachable("Invalid interpolation parameter slot");
+void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Attr = MI->getOperand(OpNum).getImm();
+ O << "attr" << Attr;
+}
+
+void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Chan = MI->getOperand(OpNum).getImm();
+ O << '.' << "xyzw"[Chan & 0x3];
+}
+
+void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ if (Val == 0) {
+ O << " 0";
+ return;
}
+
+ if (Val & VGPRIndexMode::DST_ENABLE)
+ O << " dst";
+
+ if (Val & VGPRIndexMode::SRC0_ENABLE)
+ O << " src0";
+
+ if (Val & VGPRIndexMode::SRC1_ENABLE)
+ O << " src1";
+
+ if (Val & VGPRIndexMode::SRC2_ENABLE)
+ O << " src2";
}
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
O << ", ";
- printOperand(MI, OpNo + 1, O);
+ printOperand(MI, OpNo + 1, STI, O);
}
void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
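
A standalone sketch of the exp target decoding added by printExpTgt earlier in this hunk: the low 6 bits select an MRT, the depth/null targets, a position export, or a parameter export, with 10 and 11 reserved (decodeExpTgt is an illustrative stand-in, not the printer's interface):

#include <cstdint>
#include <iostream>
#include <string>

// Decode the 6-bit exp target field the same way printExpTgt does.
static std::string decodeExpTgt(uint32_t Val) {
  uint32_t Tgt = Val & 0x3f;                   // only the low 6 bits are defined
  if (Tgt <= 7)
    return "mrt" + std::to_string(Tgt);
  if (Tgt == 8)
    return "mrtz";
  if (Tgt == 9)
    return "null";
  if (Tgt >= 12 && Tgt <= 15)
    return "pos" + std::to_string(Tgt - 12);
  if (Tgt >= 32 && Tgt <= 63)
    return "param" + std::to_string(Tgt - 32);
  return "invalid_target_" + std::to_string(Tgt);  // 10, 11 and 16-31 are reserved
}

int main() {
  for (uint32_t V : {0u, 8u, 13u, 35u, 10u})
    std::cout << decodeExpTgt(V) << '\n';  // mrt0, mrtz, pos1, param3, invalid_target_10
}
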
@@ -595,23 +814,25 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printIfSet(MI, OpNo, O, '|');
}
void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printIfSet(MI, OpNo, O, "_SAT");
}
void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm())
O << " clamp";
}
void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
int Imm = MI->getOperand(OpNo).getImm();
if (Imm == SIOutMods::MUL2)
O << " mul:2";
@@ -622,6 +843,7 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
assert(Op.isImm() || Op.isExpr());
@@ -635,17 +857,17 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printIfSet(MI, OpNo, O, "*", " ");
}
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printIfSet(MI, OpNo, O, '-');
}
void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
switch (MI->getOperand(OpNo).getImm()) {
default: break;
case 1:
@@ -661,22 +883,24 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
printIfSet(MI, OpNo, O, '+');
}
void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
printIfSet(MI, OpNo, O, "ExecMask,");
}
void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
printIfSet(MI, OpNo, O, "Pred,");
}
void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.getImm() == 0) {
O << " (MASKED)";
@@ -684,7 +908,7 @@ void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ raw_ostream &O) {
const char * chans = "XYZW";
int sel = MI->getOperand(OpNo).getImm();
@@ -708,6 +932,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
int BankSwizzle = MI->getOperand(OpNo).getImm();
switch (BankSwizzle) {
@@ -729,11 +954,10 @@ void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
default:
break;
}
- return;
}
void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Sel = MI->getOperand(OpNo).getImm();
switch (Sel) {
case 0:
@@ -763,7 +987,7 @@ void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned CT = MI->getOperand(OpNo).getImm();
switch (CT) {
case 0:
@@ -778,7 +1002,7 @@ void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
int KCacheMode = MI->getOperand(OpNo).getImm();
if (KCacheMode > 0) {
int KCacheBank = MI->getOperand(OpNo - 2).getImm();
@@ -790,6 +1014,7 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
using namespace llvm::AMDGPU::SendMsg;
@@ -825,32 +1050,34 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
return;
}
- } while (0);
+ } while (false);
O << SImm16; // Unknown simm16 code.
}
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
+ IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+
unsigned SImm16 = MI->getOperand(OpNo).getImm();
- unsigned Vmcnt = SImm16 & 0xF;
- unsigned Expcnt = (SImm16 >> 4) & 0x7;
- unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
+ unsigned Vmcnt, Expcnt, Lgkmcnt;
+ decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt);
bool NeedSpace = false;
- if (Vmcnt != 0xF) {
+ if (Vmcnt != getVmcntBitMask(IV)) {
O << "vmcnt(" << Vmcnt << ')';
NeedSpace = true;
}
- if (Expcnt != 0x7) {
+ if (Expcnt != getExpcntBitMask(IV)) {
if (NeedSpace)
O << ' ';
O << "expcnt(" << Expcnt << ')';
NeedSpace = true;
}
- if (Lgkmcnt != 0xF) {
+ if (Lgkmcnt != getLgkmcntBitMask(IV)) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
@@ -858,7 +1085,7 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
using namespace llvm::AMDGPU::Hwreg;
unsigned SImm16 = MI->getOperand(OpNo).getImm();
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index f5a290f..a6d348f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -24,7 +24,8 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
//Autogenerated by tblgen
- void printInstruction(const MCInst *MI, raw_ostream &O);
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
@@ -33,76 +34,159 @@ public:
const MCRegisterInfo &MRI);
private:
- void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printNamedBit(const MCInst* MI, unsigned OpNo, raw_ostream& O,
+ void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef BitName);
void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLWE(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpCompr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpVM(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printRegOperand(unsigned RegNo, raw_ostream &O);
- void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printImmediate32(uint32_t I, raw_ostream &O);
- void printImmediate64(uint64_t I, raw_ostream &O);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printBankMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBoundCtrl(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpSlot(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpAttr(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+ void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+
+ template <unsigned N>
+ void printExpSrcN(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc0(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc2(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrc3(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpTgt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef Asm, StringRef Default = "");
- static void printIfSet(const MCInst *MI, unsigned OpNo,
- raw_ostream &O, char Asm);
- static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
- raw_ostream &O);
- static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ char Asm);
+ void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printUpdatePred(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printWaitFlag(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 1cb9d21..ffb92aa 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
@@ -22,30 +23,19 @@ using namespace llvm;
namespace {
-class AMDGPUMCObjectWriter : public MCObjectWriter {
-public:
- AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {}
- void executePostLayoutBinding(MCAssembler &Asm,
- const MCAsmLayout &Layout) override {
- //XXX: Implement if necessary.
- }
- void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, bool &IsPCRel,
- uint64_t &FixedValue) override {
- assert(!"Not implemented");
- }
-
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
-
-};
-
class AMDGPUAsmBackend : public MCAsmBackend {
public:
AMDGPUAsmBackend(const Target &T)
: MCAsmBackend() {}
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
+
+ void processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -55,7 +45,7 @@ public:
}
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override {
- assert(!"Not implemented");
+ llvm_unreachable("Not implemented");
}
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
@@ -65,15 +55,10 @@ public:
} //End anonymous namespace
-void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
- for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
- Asm.writeSectionData(&*I, Layout);
- }
-}
-
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
+ case AMDGPU::fixup_si_sopp_br:
+ return 2;
case FK_SecRel_1:
case FK_Data_1:
return 1;
@@ -92,40 +77,77 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
}
}
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx) {
+ int64_t SignedValue = static_cast<int64_t>(Value);
+
+ switch (Fixup.getKind()) {
+ case AMDGPU::fixup_si_sopp_br: {
+ int64_t BrImm = (SignedValue - 4) / 4;
+
+ if (Ctx && !isInt<16>(BrImm))
+ Ctx->reportError(Fixup.getLoc(), "branch size exceeds simm16");
+
+ return BrImm;
+ }
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ case FK_PCRel_4:
+ case FK_SecRel_4:
+ return Value;
+ default:
+ llvm_unreachable("unhandled fixup kind");
+ }
+}
+
+void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) {
+ MCValue Res;
+
+ // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
+ // used for long branches, this function will be called with
+ // IsResolved = false and Value set to some pre-computed value. In
+ // the example above, the value would be:
+ // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
+ // This is not what we want; we want only the expression computation. The
+ // MC layer subtracts the current offset from the expression because the
+ // fixup is of kind FK_PCRel_4.
+ // For these scenarios, evaluateAsValue gives us the computation that we
+ // want.
+ if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) &&
+ Res.isAbsolute()) {
+ Value = Res.getConstant();
+ IsResolved = true;
+
+ }
+ if (IsResolved)
+ Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
+ if (!Value)
+ return; // Doesn't change encoding.
- switch ((unsigned)Fixup.getKind()) {
- case AMDGPU::fixup_si_sopp_br: {
- int64_t BrImm = ((int64_t)Value - 4) / 4;
- if (!isInt<16>(BrImm))
- report_fatal_error("branch size exceeds simm16");
-
- uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
- *Dst = BrImm;
- break;
- }
-
- default: {
- // FIXME: Copied from AArch64
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- if (!Value)
- return; // Doesn't change encoding.
- MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
-
- // Shift the value into position.
- Value <<= Info.TargetOffset;
-
- unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
-
- // For each byte of the fragment that the fixup touches, mask in the
- // bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
- }
- }
+ MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ uint32_t Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the bits from
+ // the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
}
const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
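
A minimal sketch of the fixup_si_sopp_br adjustment factored into adjustFixupValue above: the branch operand is a signed 16-bit count of 4-byte words measured past the end of the 4-byte branch instruction (the real backend reports overflow through MCContext::reportError; the throw here is just a stand-in):

#include <cstdint>
#include <iostream>
#include <stdexcept>

// s_branch encodes a signed 16-bit count of dwords relative to the end of the
// branch instruction itself.
static int64_t adjustSoppBranch(int64_t PCRelByteOffset) {
  int64_t BrImm = (PCRelByteOffset - 4) / 4;
  if (BrImm < INT16_MIN || BrImm > INT16_MAX)   // the isInt<16> check above
    throw std::runtime_error("branch size exceeds simm16");
  return BrImm;
}

int main() {
  std::cout << adjustSoppBranch(4) << ' '      // branch to the next instruction: 0
            << adjustSoppBranch(-32) << '\n';  // short backward branch: -9
}
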
@@ -171,7 +193,8 @@ public:
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU) {
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options) {
// Use 64-bit ELF for amdgcn
return new ELFAMDGPUAsmBackend(T, TT);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index b4e3b8e..1847d7a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -38,26 +38,40 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
- // the scratch buffer.
- if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
- return ELF::R_AMDGPU_ABS32_LO;
- if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
- return ELF::R_AMDGPU_ABS32_HI;
+ if (const auto *SymA = Target.getSymA()) {
+ // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+ // the scratch buffer.
+ if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+ return ELF::R_AMDGPU_ABS32_LO;
+
+ if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+ return ELF::R_AMDGPU_ABS32_HI;
+ }
switch (Target.getAccessVariant()) {
default:
break;
case MCSymbolRefExpr::VK_GOTPCREL:
return ELF::R_AMDGPU_GOTPCREL;
+ case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO:
+ return ELF::R_AMDGPU_GOTPCREL32_LO;
+ case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI:
+ return ELF::R_AMDGPU_GOTPCREL32_HI;
+ case MCSymbolRefExpr::VK_AMDGPU_REL32_LO:
+ return ELF::R_AMDGPU_REL32_LO;
+ case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
+ return ELF::R_AMDGPU_REL32_HI;
}
switch (Fixup.getKind()) {
default: break;
case FK_PCRel_4:
return ELF::R_AMDGPU_REL32;
+ case FK_Data_4:
case FK_SecRel_4:
return ELF::R_AMDGPU_ABS32;
+ case FK_Data_8:
+ return ELF::R_AMDGPU_ABS64;
}
llvm_unreachable("unhandled relocation type");
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index c942ea9..3d3858a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -21,11 +21,19 @@
namespace llvm {
class MCInst;
+class MCInstrInfo;
class MCOperand;
class MCSubtargetInfo;
+class FeatureBitset;
class AMDGPUMCCodeEmitter : public MCCodeEmitter {
virtual void anchor();
+
+protected:
+ const MCInstrInfo &MCII;
+
+ AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+
public:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -43,6 +51,11 @@ public:
const MCSubtargetInfo &STI) const {
return 0;
}
+
+protected:
+ uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+ void verifyInstructionPredicates(const MCInst &MI,
+ uint64_t AvailableFeatures) const;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index a0d9aab..136e6ec 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -86,7 +86,7 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
}
extern "C" void LLVMInitializeAMDGPUTargetMC() {
- for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) {
+ for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
@@ -98,14 +98,15 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
}
// R600 specific registration
- TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget,
+ TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
createR600MCCodeEmitter);
// GCN specific registration
- TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
+ createSIMCCodeEmitter);
- TargetRegistry::RegisterAsmTargetStreamer(TheGCNTarget,
+ TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(),
createAMDGPUAsmTargetStreamer);
- TargetRegistry::RegisterObjectTargetStreamer(TheGCNTarget,
- createAMDGPUObjectTargetStreamer);
+ TargetRegistry::RegisterObjectTargetStreamer(
+ getTheGCNTarget(), createAMDGPUObjectTargetStreamer);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 9ab7940..548bad5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -19,7 +19,6 @@
#include "llvm/Support/DataTypes.h"
namespace llvm {
-class StringRef;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -27,13 +26,14 @@ class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
class Target;
class Triple;
class raw_pwrite_stream;
-class raw_ostream;
-extern Target TheAMDGPUTarget;
-extern Target TheGCNTarget;
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -44,7 +44,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU);
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
bool HasRelocationAddend,
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
new file mode 100644
index 0000000..95387ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
@@ -0,0 +1,408 @@
+//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Generates AMDGPU runtime metadata for YAML mapping.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPURuntimeMetadata.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+#include "AMDGPURuntimeMD.h"
+
+using namespace llvm;
+using namespace ::AMDGPU::RuntimeMD;
+
+static cl::opt<bool>
+DumpRuntimeMD("amdgpu-dump-rtmd",
+ cl::desc("Dump AMDGPU runtime metadata"));
+
+static cl::opt<bool>
+CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
+ cl::desc("Check AMDGPU runtime metadata YAML parser"));
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<KernelArg::Metadata> {
+ static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
+ YamlIO.mapRequired(KeyName::ArgSize, A.Size);
+ YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
+ YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
+ YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
+ YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
+ YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
+ YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
+ YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
+ YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
+ YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
+ YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
+ }
+ static const bool flow = true;
+};
+
+template <> struct MappingTraits<Kernel::Metadata> {
+ static void mapping(IO &YamlIO, Kernel::Metadata &K) {
+ YamlIO.mapRequired(KeyName::KernelName, K.Name);
+ YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
+ YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
+ YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
+ YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
+ YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
+ YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
+ INVALID_KERNEL_INDEX);
+ YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
+ uint8_t(0));
+ YamlIO.mapRequired(KeyName::Args, K.Args);
+ }
+ static const bool flow = true;
+};
+
+template <> struct MappingTraits<Program::Metadata> {
+ static void mapping(IO &YamlIO, Program::Metadata &Prog) {
+ YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
+ YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
+ YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
+ }
+ static const bool flow = true;
+};
+
+} // end namespace yaml
+} // end namespace llvm
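
The MappingTraits specializations above are what llvm::yaml uses to serialize the metadata structs; a rough usage sketch against the same YAMLTraits API, with a made-up Point struct standing in for the real metadata types:

#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"

struct Point { int X; int Y; };   // made-up struct, not part of this patch

namespace llvm {
namespace yaml {
template <> struct MappingTraits<Point> {
  static void mapping(IO &YamlIO, Point &P) {
    YamlIO.mapRequired("x", P.X);
    YamlIO.mapOptional("y", P.Y, 0);  // omitted on output when it equals 0
  }
};
} // end namespace yaml
} // end namespace llvm

int main() {
  Point P{3, 4};
  llvm::yaml::Output Out(llvm::outs());
  Out << P;   // emits a YAML document with x: 3 and y: 4
  return 0;
}
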
+
+// Get a vector of three integer values from MDNode \p Node.
+static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
+ assert(Node->getNumOperands() == 3);
+ std::vector<uint32_t> V;
+ for (const MDOperand &Op : Node->operands()) {
+ const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
+ V.push_back(CI->getZExtValue());
+ }
+ return V;
+}
+
+static std::string getOCLTypeName(Type *Ty, bool Signed) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return "half";
+ case Type::FloatTyID:
+ return "float";
+ case Type::DoubleTyID:
+ return "double";
+ case Type::IntegerTyID: {
+ if (!Signed)
+ return (Twine('u') + getOCLTypeName(Ty, true)).str();
+ unsigned BW = Ty->getIntegerBitWidth();
+ switch (BW) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BW)).str();
+ }
+ }
+ case Type::VectorTyID: {
+ VectorType *VecTy = cast<VectorType>(Ty);
+ Type *EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getVectorNumElements();
+ return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
+ }
+ default:
+ return "unknown";
+ }
+}
+
+static KernelArg::ValueType getRuntimeMDValueType(
+ Type *Ty, StringRef TypeName) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return KernelArg::F16;
+ case Type::FloatTyID:
+ return KernelArg::F32;
+ case Type::DoubleTyID:
+ return KernelArg::F64;
+ case Type::IntegerTyID: {
+ bool Signed = !TypeName.startswith("u");
+ switch (Ty->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? KernelArg::I8 : KernelArg::U8;
+ case 16:
+ return Signed ? KernelArg::I16 : KernelArg::U16;
+ case 32:
+ return Signed ? KernelArg::I32 : KernelArg::U32;
+ case 64:
+ return Signed ? KernelArg::I64 : KernelArg::U64;
+ default:
+ // Runtime does not recognize other integer types. Report as struct type.
+ return KernelArg::Struct;
+ }
+ }
+ case Type::VectorTyID:
+ return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
+ case Type::PointerTyID:
+ return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
+ default:
+ return KernelArg::Struct;
+ }
+}
+
+static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
+ AMDGPUAS::AddressSpaces A) {
+ switch (A) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ return KernelArg::Global;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ return KernelArg::Constant;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return KernelArg::Local;
+ case AMDGPUAS::FLAT_ADDRESS:
+ return KernelArg::Generic;
+ case AMDGPUAS::REGION_ADDRESS:
+ return KernelArg::Region;
+ default:
+ return KernelArg::Private;
+ }
+}
+
+static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
+ Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
+ StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
+ StringRef AccQual = "") {
+
+ KernelArg::Metadata Arg;
+
+ // Set ArgSize and ArgAlign.
+ Arg.Size = DL.getTypeAllocSize(T);
+ Arg.Align = DL.getABITypeAlignment(T);
+ if (auto PT = dyn_cast<PointerType>(T)) {
+ auto ET = PT->getElementType();
+ if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
+ Arg.PointeeAlign = DL.getABITypeAlignment(ET);
+ }
+
+ // Set ArgTypeName.
+ Arg.TypeName = TypeName;
+
+ // Set ArgName.
+ Arg.Name = ArgName;
+
+ // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
+ SmallVector<StringRef, 1> SplitQ;
+ TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
+
+ for (StringRef KeyName : SplitQ) {
+ auto *P = StringSwitch<uint8_t *>(KeyName)
+ .Case("volatile", &Arg.IsVolatile)
+ .Case("restrict", &Arg.IsRestrict)
+ .Case("const", &Arg.IsConst)
+ .Case("pipe", &Arg.IsPipe)
+ .Default(nullptr);
+ if (P)
+ *P = 1;
+ }
+
+ // Set ArgKind.
+ Arg.Kind = Kind;
+
+ // Set ArgValueType.
+ Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
+
+ // Set ArgAccQual.
+ if (!AccQual.empty()) {
+ Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
+ .Case("read_only", KernelArg::ReadOnly)
+ .Case("write_only", KernelArg::WriteOnly)
+ .Case("read_write", KernelArg::ReadWrite)
+ .Default(KernelArg::AccNone);
+ }
+
+ // Set ArgAddrQual.
+ if (auto *PT = dyn_cast<PointerType>(T)) {
+ Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
+ PT->getAddressSpace()));
+ }
+
+ return Arg;
+}
+
+static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
+ Kernel::Metadata Kernel;
+ Kernel.Name = F.getName();
+ auto &M = *F.getParent();
+
+ // Set Language and LanguageVersion.
+ if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
+ if (MD->getNumOperands() != 0) {
+ auto Node = MD->getOperand(0);
+ if (Node->getNumOperands() > 1) {
+ Kernel.Language = "OpenCL C";
+ uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+ ->getZExtValue();
+ uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+ ->getZExtValue();
+ Kernel.LanguageVersion.push_back(Major);
+ Kernel.LanguageVersion.push_back(Minor);
+ }
+ }
+ }
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto &Arg : F.args()) {
+ unsigned I = Arg.getArgNo();
+ Type *T = Arg.getType();
+ auto TypeName = dyn_cast<MDString>(F.getMetadata(
+ "kernel_arg_type")->getOperand(I))->getString();
+ auto BaseTypeName = cast<MDString>(F.getMetadata(
+ "kernel_arg_base_type")->getOperand(I))->getString();
+ StringRef ArgName;
+ if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
+ ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
+ auto TypeQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_type_qual")->getOperand(I))->getString();
+ auto AccQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_access_qual")->getOperand(I))->getString();
+ KernelArg::Kind Kind;
+ if (TypeQual.find("pipe") != StringRef::npos)
+ Kind = KernelArg::Pipe;
+ else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
+ .Case("sampler_t", KernelArg::Sampler)
+ .Case("queue_t", KernelArg::Queue)
+ .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
+ "image2d_t" , "image2d_array_t", KernelArg::Image)
+ .Cases("image2d_depth_t", "image2d_array_depth_t",
+ "image2d_msaa_t", "image2d_array_msaa_t",
+ "image2d_msaa_depth_t", KernelArg::Image)
+ .Cases("image2d_array_msaa_depth_t", "image3d_t",
+ KernelArg::Image)
+ .Default(isa<PointerType>(T) ?
+ (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
+ KernelArg::DynamicSharedPointer :
+ KernelArg::GlobalBuffer) :
+ KernelArg::ByValue);
+ Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
+ BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
+ }
+
+ // Emit hidden kernel arguments for OpenCL kernels.
+ if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
+ auto Int64T = Type::getInt64Ty(F.getContext());
+ Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+ KernelArg::HiddenGlobalOffsetX));
+ Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+ KernelArg::HiddenGlobalOffsetY));
+ Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+ KernelArg::HiddenGlobalOffsetZ));
+ if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
+ auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
+ KernelArg::Global);
+ Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
+ KernelArg::HiddenPrintfBuffer));
+ }
+ }
+
+ // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
+ if (auto RWGS = F.getMetadata("reqd_work_group_size"))
+ Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
+
+ if (auto WGSH = F.getMetadata("work_group_size_hint"))
+ Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
+
+ if (auto VTH = F.getMetadata("vec_type_hint"))
+ Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
+ VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
+ VTH->getOperand(1))->getZExtValue());
+
+ return Kernel;
+}
+
+Program::Metadata::Metadata(const std::string &YAML) {
+ yaml::Input Input(YAML);
+ Input >> *this;
+}
+
+std::string Program::Metadata::toYAML(void) {
+ std::string Text;
+ raw_string_ostream Stream(Text);
+ yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */);
+ Output << *this;
+ return Stream.str();
+}
+
+Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
+ return Program::Metadata(S);
+}
+
+// Check if the YAML string can be parsed.
+static void checkRuntimeMDYAMLString(const std::string &YAML) {
+ auto P = Program::Metadata::fromYAML(YAML);
+ auto S = P.toYAML();
+ llvm::errs() << "AMDGPU runtime metadata parser test "
+ << (YAML == S ? "passes" : "fails") << ".\n";
+ if (YAML != S) {
+ llvm::errs() << "First output: " << YAML << '\n'
+ << "Second output: " << S << '\n';
+ }
+}
+
+std::string llvm::getRuntimeMDYAMLString(Module &M) {
+ Program::Metadata Prog;
+ Prog.MDVersionSeq.push_back(MDVersion);
+ Prog.MDVersionSeq.push_back(MDRevision);
+
+ // Set PrintfInfo.
+ if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
+ for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
+ auto Node = MD->getOperand(I);
+ if (Node->getNumOperands() > 0)
+ Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
+ ->getString());
+ }
+ }
+
+ // Set Kernels.
+ for (auto &F: M.functions()) {
+ if (!F.getMetadata("kernel_arg_type"))
+ continue;
+ Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
+ }
+
+ auto YAML = Prog.toYAML();
+
+ if (DumpRuntimeMD)
+ llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
+
+ if (CheckRuntimeMDParser)
+ checkRuntimeMDYAMLString(YAML);
+
+ return YAML;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
new file mode 100644
index 0000000..a92fdd4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
@@ -0,0 +1,26 @@
+//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares functions for generating runtime metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+// Get runtime metadata as YAML string.
+std::string getRuntimeMDYAMLString(Module &M);
+
+}
+#endif
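A minimal sketch of how this entry point might be driven from a standalone tool (assumed to be built inside the LLVM tree so the MCTargetDesc header is reachable; within this patch, the target streamers below are the actual callers):

// Hedged usage sketch: load a module and print its AMDGPU runtime
// metadata as YAML. The tool name and build setup are assumptions.
#include "AMDGPURuntimeMD.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

int main(int argc, char **argv) {
  if (argc < 2)
    return 1;
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  // Kernels are discovered through their "kernel_arg_type" metadata,
  // exactly as getRuntimeMDYAMLString does above.
  std::unique_ptr<llvm::Module> M = llvm::parseIRFile(argv[1], Err, Ctx);
  if (!M)
    return 1;
  llvm::outs() << llvm::getRuntimeMDYAMLString(*M);
  return 0;
}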
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 83dcaac..3392183 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -11,21 +11,33 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPUTargetStreamer.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/FormattedStream.h"
+#include "AMDGPURuntimeMD.h"
+
+namespace llvm {
+#include "AMDGPUPTNote.h"
+}
using namespace llvm;
+using namespace llvm::AMDGPU;
AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
- : MCTargetStreamer(S) { }
+ : MCTargetStreamer(S) {}
//===----------------------------------------------------------------------===//
// AMDGPUTargetAsmStreamer
@@ -56,169 +68,9 @@ AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
void
AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
- uint64_t ComputePgmRsrc2 = (Header.compute_pgm_resource_registers >> 32);
- bool EnableSGPRPrivateSegmentBuffer = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
- bool EnableSGPRDispatchPtr = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
- bool EnableSGPRQueuePtr = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
- bool EnableSGPRKernargSegmentPtr = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
- bool EnableSGPRDispatchID = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- bool EnableSGPRFlatScratchInit = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
- bool EnableSGPRPrivateSegmentSize = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
- bool EnableSGPRGridWorkgroupCountX = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X);
- bool EnableSGPRGridWorkgroupCountY = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y);
- bool EnableSGPRGridWorkgroupCountZ = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z);
- bool EnableOrderedAppendGDS = (Header.code_properties &
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS);
- uint32_t PrivateElementSize = (Header.code_properties &
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >>
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT;
- bool IsPtr64 = (Header.code_properties & AMD_CODE_PROPERTY_IS_PTR64);
- bool IsDynamicCallstack = (Header.code_properties &
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK);
- bool IsDebugEnabled = (Header.code_properties &
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED);
- bool IsXNackEnabled = (Header.code_properties &
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED);
-
- OS << "\t.amd_kernel_code_t\n" <<
- "\t\tkernel_code_version_major = " <<
- Header.amd_kernel_code_version_major << '\n' <<
- "\t\tkernel_code_version_minor = " <<
- Header.amd_kernel_code_version_minor << '\n' <<
- "\t\tmachine_kind = " <<
- Header.amd_machine_kind << '\n' <<
- "\t\tmachine_version_major = " <<
- Header.amd_machine_version_major << '\n' <<
- "\t\tmachine_version_minor = " <<
- Header.amd_machine_version_minor << '\n' <<
- "\t\tmachine_version_stepping = " <<
- Header.amd_machine_version_stepping << '\n' <<
- "\t\tkernel_code_entry_byte_offset = " <<
- Header.kernel_code_entry_byte_offset << '\n' <<
- "\t\tkernel_code_prefetch_byte_size = " <<
- Header.kernel_code_prefetch_byte_size << '\n' <<
- "\t\tmax_scratch_backing_memory_byte_size = " <<
- Header.max_scratch_backing_memory_byte_size << '\n' <<
- "\t\tcompute_pgm_rsrc1_vgprs = " <<
- G_00B848_VGPRS(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_sgprs = " <<
- G_00B848_SGPRS(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_priority = " <<
- G_00B848_PRIORITY(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_float_mode = " <<
- G_00B848_FLOAT_MODE(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_priv = " <<
- G_00B848_PRIV(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_dx10_clamp = " <<
- G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_debug_mode = " <<
- G_00B848_DEBUG_MODE(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc1_ieee_mode = " <<
- G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) << '\n' <<
- "\t\tcompute_pgm_rsrc2_scratch_en = " <<
- G_00B84C_SCRATCH_EN(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_user_sgpr = " <<
- G_00B84C_USER_SGPR(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_tgid_x_en = " <<
- G_00B84C_TGID_X_EN(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_tgid_y_en = " <<
- G_00B84C_TGID_Y_EN(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_tgid_z_en = " <<
- G_00B84C_TGID_Z_EN(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_tg_size_en = " <<
- G_00B84C_TG_SIZE_EN(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = " <<
- G_00B84C_TIDIG_COMP_CNT(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_excp_en_msb = " <<
- G_00B84C_EXCP_EN_MSB(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_lds_size = " <<
- G_00B84C_LDS_SIZE(ComputePgmRsrc2) << '\n' <<
- "\t\tcompute_pgm_rsrc2_excp_en = " <<
- G_00B84C_EXCP_EN(ComputePgmRsrc2) << '\n' <<
-
- "\t\tenable_sgpr_private_segment_buffer = " <<
- EnableSGPRPrivateSegmentBuffer << '\n' <<
- "\t\tenable_sgpr_dispatch_ptr = " <<
- EnableSGPRDispatchPtr << '\n' <<
- "\t\tenable_sgpr_queue_ptr = " <<
- EnableSGPRQueuePtr << '\n' <<
- "\t\tenable_sgpr_kernarg_segment_ptr = " <<
- EnableSGPRKernargSegmentPtr << '\n' <<
- "\t\tenable_sgpr_dispatch_id = " <<
- EnableSGPRDispatchID << '\n' <<
- "\t\tenable_sgpr_flat_scratch_init = " <<
- EnableSGPRFlatScratchInit << '\n' <<
- "\t\tenable_sgpr_private_segment_size = " <<
- EnableSGPRPrivateSegmentSize << '\n' <<
- "\t\tenable_sgpr_grid_workgroup_count_x = " <<
- EnableSGPRGridWorkgroupCountX << '\n' <<
- "\t\tenable_sgpr_grid_workgroup_count_y = " <<
- EnableSGPRGridWorkgroupCountY << '\n' <<
- "\t\tenable_sgpr_grid_workgroup_count_z = " <<
- EnableSGPRGridWorkgroupCountZ << '\n' <<
- "\t\tenable_ordered_append_gds = " <<
- EnableOrderedAppendGDS << '\n' <<
- "\t\tprivate_element_size = " <<
- PrivateElementSize << '\n' <<
- "\t\tis_ptr64 = " <<
- IsPtr64 << '\n' <<
- "\t\tis_dynamic_callstack = " <<
- IsDynamicCallstack << '\n' <<
- "\t\tis_debug_enabled = " <<
- IsDebugEnabled << '\n' <<
- "\t\tis_xnack_enabled = " <<
- IsXNackEnabled << '\n' <<
- "\t\tworkitem_private_segment_byte_size = " <<
- Header.workitem_private_segment_byte_size << '\n' <<
- "\t\tworkgroup_group_segment_byte_size = " <<
- Header.workgroup_group_segment_byte_size << '\n' <<
- "\t\tgds_segment_byte_size = " <<
- Header.gds_segment_byte_size << '\n' <<
- "\t\tkernarg_segment_byte_size = " <<
- Header.kernarg_segment_byte_size << '\n' <<
- "\t\tworkgroup_fbarrier_count = " <<
- Header.workgroup_fbarrier_count << '\n' <<
- "\t\twavefront_sgpr_count = " <<
- Header.wavefront_sgpr_count << '\n' <<
- "\t\tworkitem_vgpr_count = " <<
- Header.workitem_vgpr_count << '\n' <<
- "\t\treserved_vgpr_first = " <<
- Header.reserved_vgpr_first << '\n' <<
- "\t\treserved_vgpr_count = " <<
- Header.reserved_vgpr_count << '\n' <<
- "\t\treserved_sgpr_first = " <<
- Header.reserved_sgpr_first << '\n' <<
- "\t\treserved_sgpr_count = " <<
- Header.reserved_sgpr_count << '\n' <<
- "\t\tdebug_wavefront_private_segment_offset_sgpr = " <<
- Header.debug_wavefront_private_segment_offset_sgpr << '\n' <<
- "\t\tdebug_private_segment_buffer_sgpr = " <<
- Header.debug_private_segment_buffer_sgpr << '\n' <<
- "\t\tkernarg_segment_alignment = " <<
- (uint32_t)Header.kernarg_segment_alignment << '\n' <<
- "\t\tgroup_segment_alignment = " <<
- (uint32_t)Header.group_segment_alignment << '\n' <<
- "\t\tprivate_segment_alignment = " <<
- (uint32_t)Header.private_segment_alignment << '\n' <<
- "\t\twavefront_size = " <<
- (uint32_t)Header.wavefront_size << '\n' <<
- "\t\tcall_convention = " <<
- Header.call_convention << '\n' <<
- "\t\truntime_loader_kernel_symbol = " <<
- Header.runtime_loader_kernel_symbol << '\n' <<
- // TODO: control_directives
- "\t.end_amd_kernel_code_t\n";
-
+ OS << "\t.amd_kernel_code_t\n";
+ dumpAmdKernelCode(&Header, OS, "\t\t");
+ OS << "\t.end_amd_kernel_code_t\n";
}
void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
@@ -241,35 +93,63 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
}
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
+ OS << "\t.amdgpu_runtime_metadata\n";
+ OS << getRuntimeMDYAMLString(M);
+ OS << "\n\t.end_amdgpu_runtime_metadata\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+ OS << "\t.amdgpu_runtime_metadata";
+ OS << Metadata;
+ OS << "\t.end_amdgpu_runtime_metadata\n";
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
- : AMDGPUTargetStreamer(S), Streamer(S) { }
+ : AMDGPUTargetStreamer(S), Streamer(S) {}
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
void
+AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ,
+ PT_NOTE::NoteType Type,
+ std::function<void(MCELFStreamer &)> EmitDesc) {
+ auto &S = getStreamer();
+ auto &Context = S.getContext();
+
+ auto NameSZ = sizeof(PT_NOTE::NoteName);
+
+ S.PushSection();
+ S.SwitchSection(Context.getELFSection(
+ PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+ S.EmitIntValue(NameSZ, 4); // namesz
+ S.EmitValue(DescSZ, 4); // descsz
+ S.EmitIntValue(Type, 4); // type
+ S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name
+ S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ EmitDesc(S); // desc
+ S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.PopSection();
+}
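The sequence above follows the generic ELF note record layout; a rough sketch of the bytes EmitAMDGPUNote produces (descriptive field names, not LLVM identifiers):

// Generic ELF note record, as emitted above: three 32-bit words, then the
// name and desc payloads, each padded to a 4-byte boundary.
#include <cstdint>

struct AMDGPUNoteRecord {
  uint32_t namesz; // sizeof(PT_NOTE::NoteName), terminating NUL included
  uint32_t descsz; // value of the DescSZ expression (may be label-based)
  uint32_t type;   // PT_NOTE::NoteType value, e.g. the HSA ISA note
  // char name[namesz];           then padding to a 4-byte boundary
  // unsigned char desc[descsz];  then padding to a 4-byte boundary
};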
+
+void
AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) {
- MCStreamer &OS = getStreamer();
- MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0);
-
- unsigned NameSZ = 4;
- OS.PushSection();
- OS.SwitchSection(Note);
- OS.EmitIntValue(NameSZ, 4); // namesz
- OS.EmitIntValue(8, 4); // descz
- OS.EmitIntValue(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, 4); // type
- OS.EmitBytes(StringRef("AMD", NameSZ)); // name
- OS.EmitIntValue(Major, 4); // desc
- OS.EmitIntValue(Minor, 4);
- OS.EmitValueToAlignment(4);
- OS.PopSection();
+ EmitAMDGPUNote(
+ MCConstantExpr::create(8, getContext()),
+ PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+ [&](MCELFStreamer &OS){
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ }
+ );
}
void
@@ -278,33 +158,28 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) {
- MCStreamer &OS = getStreamer();
- MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0);
-
- unsigned NameSZ = 4;
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;
+
unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
- sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
- VendorNameSize + ArchNameSize;
-
- OS.PushSection();
- OS.SwitchSection(Note);
- OS.EmitIntValue(NameSZ, 4); // namesz
- OS.EmitIntValue(DescSZ, 4); // descsz
- OS.EmitIntValue(NT_AMDGPU_HSA_ISA, 4); // type
- OS.EmitBytes(StringRef("AMD", 4)); // name
- OS.EmitIntValue(VendorNameSize, 2); // desc
- OS.EmitIntValue(ArchNameSize, 2);
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
- OS.EmitIntValue(Stepping, 4);
- OS.EmitBytes(VendorName);
- OS.EmitIntValue(0, 1); // NULL terminate VendorName
- OS.EmitBytes(ArchName);
- OS.EmitIntValue(0, 1); // NULL terminte ArchName
- OS.EmitValueToAlignment(4);
- OS.PopSection();
+ sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
+ VendorNameSize + ArchNameSize;
+
+ EmitAMDGPUNote(
+ MCConstantExpr::create(DescSZ, getContext()),
+ PT_NOTE::NT_AMDGPU_HSA_ISA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitIntValue(VendorNameSize, 2);
+ OS.EmitIntValue(ArchNameSize, 2);
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ OS.EmitIntValue(Stepping, 4);
+ OS.EmitBytes(VendorName);
+ OS.EmitIntValue(0, 1); // NULL terminate VendorName
+ OS.EmitBytes(ArchName);
+ OS.EmitIntValue(0, 1); // NULL terminate ArchName
+ }
+ );
}
void
@@ -340,3 +215,28 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
Symbol->setType(ELF::STT_OBJECT);
Symbol->setBinding(ELF::STB_GLOBAL);
}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+ // Create two labels to mark the beginning and end of the desc field
+ // and a MCExpr to calculate the size of the desc field.
+ auto &Context = getContext();
+ auto *DescBegin = Context.createTempSymbol();
+ auto *DescEnd = Context.createTempSymbol();
+ auto *DescSZ = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DescEnd, Context),
+ MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+ EmitAMDGPUNote(
+ DescSZ,
+ PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(Metadata);
+ OS.EmitLabel(DescEnd);
+ }
+ );
+}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
+ EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index b3d59e8..e2f2058 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -14,11 +14,20 @@
#include "llvm/MC/MCStreamer.h"
namespace llvm {
+#include "AMDGPUPTNote.h"
+class DataLayout;
+class Function;
class MCELFStreamer;
class MCSymbol;
+class MDNode;
+class Module;
+class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
+protected:
+ MCContext &getContext() const { return Streamer.getContext(); }
+
public:
AMDGPUTargetStreamer(MCStreamer &S);
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -36,6 +45,10 @@ public:
virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
+
+ virtual void EmitRuntimeMetadata(Module &M) = 0;
+
+ virtual void EmitRuntimeMetadata(StringRef Metadata) = 0;
};
class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
@@ -56,23 +69,19 @@ public:
void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
-};
-class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
+ void EmitRuntimeMetadata(Module &M) override;
- enum NoteType {
- NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
- NT_AMDGPU_HSA_HSAIL = 2,
- NT_AMDGPU_HSA_ISA = 3,
- NT_AMDGPU_HSA_PRODUCER = 4,
- NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
- NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
- NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
- };
+ void EmitRuntimeMetadata(StringRef Metadata) override;
+};
+class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
+ void EmitAMDGPUNote(const MCExpr* DescSize,
+ AMDGPU::PT_NOTE::NoteType Type,
+ std::function<void(MCELFStreamer &)> EmitDesc);
+
public:
AMDGPUTargetELFStreamer(MCStreamer &S);
@@ -92,6 +101,10 @@ public:
void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
+
+ void EmitRuntimeMetadata(Module &M) override;
+
+ void EmitRuntimeMetadata(StringRef Metadata) override;
};
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 5e8e6ce..6015ec1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -20,26 +20,30 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
- R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
- void operator=(const R600MCCodeEmitter &) = delete;
- const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
- : MCII(mcii), MRI(mri) { }
+ : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
+ R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
/// \brief Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -58,7 +62,7 @@ private:
unsigned getHWReg(unsigned regNo) const;
};
-} // End anonymous namespace
+} // end anonymous namespace
enum RegElement {
ELEMENT_X = 0,
@@ -86,6 +90,9 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (MI.getOpcode() == AMDGPU::RETURN ||
MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
@@ -178,4 +185,5 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MO.getImm();
}
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 71b585c..0c5bb06 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,38 +17,42 @@
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
using namespace llvm;
namespace {
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
- SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
- void operator=(const SIMCCodeEmitter &) = delete;
- const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
- /// \brief Can this operand also contain immediate values?
- bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
-
/// \brief Encode an fp or int literal
- uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const;
+ uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
+ const MCSubtargetInfo &STI) const;
public:
SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
MCContext &ctx)
- : MCII(mcii), MRI(mri) { }
-
- ~SIMCCodeEmitter() override {}
+ : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+ SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
+ SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
/// \brief Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -67,7 +71,7 @@ public:
const MCSubtargetInfo &STI) const override;
};
-} // End anonymous namespace
+} // end anonymous namespace
MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -75,14 +79,6 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
return new SIMCCodeEmitter(MCII, MRI, Ctx);
}
-bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
- unsigned OpNo) const {
- unsigned OpType = Desc.OpInfo[OpNo].OperandType;
-
- return OpType == AMDGPU::OPERAND_REG_IMM32 ||
- OpType == AMDGPU::OPERAND_REG_INLINE_C;
-}
-
// Returns the encoding value to use if the given integer is an integer inline
// immediate value, or 0 if it is not.
template <typename IntTy>
@@ -96,7 +92,43 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) {
return 0;
}
-static uint32_t getLit32Encoding(uint32_t Val) {
+static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
+ uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
+
+ if (Val == 0x3800) // 0.5
+ return 240;
+
+ if (Val == 0xB800) // -0.5
+ return 241;
+
+ if (Val == 0x3C00) // 1.0
+ return 242;
+
+ if (Val == 0xBC00) // -1.0
+ return 243;
+
+ if (Val == 0x4000) // 2.0
+ return 244;
+
+ if (Val == 0xC000) // -2.0
+ return 245;
+
+ if (Val == 0x4400) // 4.0
+ return 246;
+
+ if (Val == 0xC400) // -4.0
+ return 247;
+
+ if (Val == 0x3118 && // 1.0 / (2.0 * pi)
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ return 248;
+
+ return 255;
+}
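The magic numbers above are IEEE-754 binary16 bit patterns. A small standalone sketch (not LLVM code) that decodes a normal half value can be used to sanity-check the table, e.g. 0x3800 is 0.5 and 0x3118 is the half-precision rounding of 1/(2*pi):

// Decode a normal (non-subnormal, non-inf/NaN) IEEE-754 binary16 value.
#include <cmath>
#include <cstdint>
#include <cstdio>

static double decodeHalf(uint16_t Bits) {
  int Sign = (Bits >> 15) & 1;
  int Exp  = (Bits >> 10) & 0x1F; // biased exponent, bias = 15
  int Mant = Bits & 0x3FF;        // 10 fraction bits
  double Val = std::ldexp(1.0 + Mant / 1024.0, Exp - 15);
  return Sign ? -Val : Val;
}

int main() {
  std::printf("%g\n", decodeHalf(0x3800)); // 0.5
  std::printf("%g\n", decodeHalf(0x3C00)); // 1
  std::printf("%g\n", decodeHalf(0xC400)); // -4
  std::printf("%g\n", decodeHalf(0x3118)); // ~0.15918, half-rounded 1/(2*pi)
}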
+
+static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
if (IntImm != 0)
return IntImm;
@@ -125,10 +157,14 @@ static uint32_t getLit32Encoding(uint32_t Val) {
if (Val == FloatToBits(-4.0f))
return 247;
+ if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi)
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ return 248;
+
return 255;
}
-static uint32_t getLit64Encoding(uint64_t Val) {
+static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
if (IntImm != 0)
return IntImm;
@@ -157,15 +193,19 @@ static uint32_t getLit64Encoding(uint64_t Val) {
if (Val == DoubleToBits(-4.0))
return 247;
+ if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi)
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ return 248;
+
return 255;
}
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
- unsigned OpSize) const {
-
+ const MCOperandInfo &OpInfo,
+ const MCSubtargetInfo &STI) const {
int64_t Imm;
if (MO.isExpr()) {
- const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+ const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr());
if (!C)
return 255;
@@ -180,17 +220,23 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
Imm = MO.getImm();
}
- if (OpSize == 4)
- return getLit32Encoding(static_cast<uint32_t>(Imm));
-
- assert(OpSize == 8);
-
- return getLit64Encoding(static_cast<uint64_t>(Imm));
+ switch (AMDGPU::getOperandSize(OpInfo)) {
+ case 4:
+ return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ case 8:
+ return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+ case 2:
+ return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+ default:
+ llvm_unreachable("invalid operand size");
+ }
}
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
@@ -207,15 +253,12 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
// Check if this operand should be encoded as [SV]Src
- if (!isSrcOperand(Desc, i))
+ if (!AMDGPU::isSISrcOperand(Desc, i))
continue;
- int RCID = Desc.OpInfo[i].RegClass;
- const MCRegisterClass &RC = MRI.getRegClass(RCID);
-
// Is this operand a literal immediate?
const MCOperand &Op = MI.getOperand(i);
- if (getLitEncoding(Op, RC.getSize()) != 255)
+ if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255)
continue;
// Yes! Encode it
@@ -224,7 +267,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (Op.isImm())
Imm = Op.getImm();
else if (Op.isExpr()) {
- if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+ if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
Imm = C->getValue();
} else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
@@ -262,7 +305,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MRI.getEncodingValue(MO.getReg());
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
- const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
MCFixupKind Kind;
if (Expr && Expr->getSymbol().isExternal())
Kind = FK_Data_4;
@@ -279,11 +322,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- if (isSrcOperand(Desc, OpNo)) {
- int RCID = Desc.OpInfo[OpNo].RegClass;
- const MCRegisterClass &RC = MRI.getRegClass(RCID);
-
- uint32_t Enc = getLitEncoding(MO, RC.getSize());
+ if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
+ uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
return Enc;
@@ -293,4 +333,3 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
llvm_unreachable("Encoding of this operand type is not supported yet.");
return 0;
}
-
diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
new file mode 100644
index 0000000..46803e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -0,0 +1,763 @@
+//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MIMG_Mask <string op, int channels> {
+ string Op = op;
+ int Channels = channels;
+}
+
+class mimg <bits<7> si, bits<7> vi = si> {
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class MIMG_Helper <dag outs, dag ins, string asm,
+ string dns=""> : MIMG<outs, ins, asm,[]> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+ let DecoderNamespace = dns;
+ let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
+ let AsmMatchConverter = "cvtMIMG";
+ let usesCustomInserter = 1;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass addr_rc,
+ string dns=""> : MIMG_Helper <
+ (outs dst_rc:$vdata),
+ (ins addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
+ let ssamp = 0;
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Store_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ >, MIMGe<op> {
+ let ssamp = 0;
+ let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+ let DisableWQM = 1;
+}
+
+multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ int channels> {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm> {
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs data_rc:$vdst),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ > {
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+ let DisableWQM = 1;
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+}
+
+class MIMG_Atomic_Real_si<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.NONE>;
+ }
+
+ let ssamp = 0 in {
+ def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+
+ def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ }
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc,
+ bit wqm,
+ string dns=""> : MIMG_Helper <
+ (outs dst_rc:$vdata),
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
+ let WQM = wqm;
+}
+
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels, bit wqm> {
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> {
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
+
+class MIMG_Gather_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc, bit wqm> : MIMG <
+ (outs dst_rc:$vdata),
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ []>, MIMGe<op> {
+ let mayLoad = 1;
+ let mayStore = 0;
+
+ // DMASK was repurposed for GATHER4. 4 components are always
+ // returned and DMASK works like a swizzle - it selects
+ // the component to fetch. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ // Therefore, disable all code which updates DMASK by setting this:
+ let Gather4 = 1;
+ let hasPostISelHook = 0;
+ let WQM = wqm;
+
+ let isAsmParserOnly = 1; // TBD: fix it later
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels, bit wqm> {
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> {
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
+
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
+let SubtargetPredicate = isGCN in {
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+}
+
+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/
+
+// Image + sampler
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
+ i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
+>;
+
+multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
+ def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
+}
+
+// Image + sampler for amdgcn
+// TODO:
+// 1. Handle half data type like v4f16, and add D16 bit support;
+// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
+// 3. Add A16 support when we pass address of half type.
+multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+ def : Pat<
+ (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
+ i1:$slc, i1:$lwe, i1:$da)),
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da))
+ >;
+}
+
+multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
+ defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
+}
+
+// TODO: support v3f32.
+multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+// Image only
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
+ imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
+>;
+
+multiclass ImagePatterns<SDPatternOperator name, string opcode> {
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+ def : Pat <
+ (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
+ i1:$da)),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da))
+ >;
+}
+
+multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+ defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32.
+multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+ def : Pat <
+ (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
+ i1:$lwe, i1:$da),
+ (opcode $data, $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ 0, 0, (as_i1imm $lwe), (as_i1imm $da))
+ >;
+}
+
+multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+ defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32.
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
+ (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
+}
+
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
+ (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
+ imm:$r128, imm:$da, imm:$slc),
+ (EXTRACT_SUBREG
+ (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
+ $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
+ sub0)
+>;
+
+// ======= SI Image Intrinsics ================
+
+// Image load
+defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
+defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
+
+// Basic sample
+defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
+defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">;
+defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
+// ======= amdgcn Image Intrinsics ==============
+
+// Image load
+defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
+defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
+
+// Image store
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
+
+// Basic sample
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
+
+// Image atomics
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
+
+/* SIsample for simple 1D texture lookup */
+def : Pat <
+ (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
+ (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+>;
+
+class SampleShadowPattern<SDNode name, MIMG opcode,
+ ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
+ ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+>;
+
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+ MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+ MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+ def : SamplePattern <SIsample, sample, addr_type>;
+ def : SampleRectPattern <SIsample, sample, addr_type>;
+ def : SampleArrayPattern <SIsample, sample, addr_type>;
+ def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+ def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+ def : SamplePattern <SIsamplel, sample_l, addr_type>;
+ def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+ def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+ def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+ def : SamplePattern <SIsampleb, sample_b, addr_type>;
+ def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+ def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+ def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+ def : SamplePattern <SIsampled, sample_d, addr_type>;
+ def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+ def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+ def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
+}
+
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+ IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+ IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+ IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+ v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+ IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+ IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+ IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+ v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+ IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+ IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+ IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+ v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+ IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+ IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+ IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+ v16i32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
index f5f1eb1..3c07cc7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Processors.td
+++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td
@@ -101,55 +101,89 @@ def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
//===----------------------------------------------------------------------===//
def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
- [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0]
+ [FeatureISAVersion7_0_0]
>;
def : ProcessorModel<"kabini", SIQuarterSpeedModel,
- [FeatureSeaIslands, FeatureLDSBankCount16]
+ [FeatureISAVersion7_0_2]
>;
def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
- [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0]
+ [FeatureISAVersion7_0_0]
>;
-def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops,
- FeatureLDSBankCount32, FeatureISAVersion7_0_1]
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
>;
def : ProcessorModel<"mullins", SIQuarterSpeedModel,
- [FeatureSeaIslands, FeatureLDSBankCount16]>;
+ [FeatureISAVersion7_0_2]>;
+
+def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"gfx701", SIFullSpeedModel,
+ [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_2]
+>;
//===----------------------------------------------------------------------===//
// Volcanic Islands
//===----------------------------------------------------------------------===//
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
- FeatureLDSBankCount32]
+ [FeatureISAVersion8_0_2]
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
- FeatureLDSBankCount32]
+ [FeatureISAVersion8_0_0]
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+ [FeatureISAVersion8_0_1]
>;
-def : ProcessorModel<"fiji", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32]
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
>;
-def : ProcessorModel<"stoney", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
>;
def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+ [FeatureISAVersion8_0_3]
>;
def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_1]
>;
+
+def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx804", SIQuarterSpeedModel,
+ [FeatureISAVersion8_0_4]
+>;
+
+def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
+ [FeatureISAVersion8_1_0]
+>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 3ccde79..d0aba38 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -66,7 +66,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override;
+ StringRef getPassName() const override;
};
char R600ClauseMergePass::ID = 0;
@@ -201,7 +201,7 @@ bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-const char *R600ClauseMergePass::getPassName() const {
+StringRef R600ClauseMergePass::getPassName() const {
return "R600 Merge Clause Markers Pass";
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index d5bda4a..45b36d3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -354,10 +354,10 @@ private:
if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
continue;
int64_t Imm = Src.second;
- std::vector<MachineOperand*>::iterator It =
- std::find_if(Lits.begin(), Lits.end(),
- [&](MachineOperand* val)
- { return val->isImm() && (val->getImm() == Imm);});
+ std::vector<MachineOperand *>::iterator It =
+ find_if(Lits, [&](MachineOperand *val) {
+ return val->isImm() && (val->getImm() == Imm);
+ });
// Get corresponding Operand
MachineOperand &Operand = MI.getOperand(
@@ -450,27 +450,24 @@ private:
return ClauseFile(&ClauseHead, std::move(ClauseContent));
}
- void
- EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
- unsigned &CfCount) {
+ void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
+ const DebugLoc &DL, ClauseFile &Clause,
+ unsigned &CfCount) {
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
- .addImm(CfCount);
+ BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
CfCount += 2 * Clause.second.size();
}
- void
- EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
- unsigned &CfCount) {
+ void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
+ ClauseFile &Clause, unsigned &CfCount) {
Clause.first->getOperand(0).setImm(0);
CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
- BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
- .addImm(CfCount);
+ BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
BB->splice(InsertPos, BB, Clause.second[i]);
}
@@ -644,17 +641,18 @@ public:
break;
}
case AMDGPU::RETURN: {
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
CfCount++;
if (CfCount % 2) {
- BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
CfCount++;
}
MI->eraseFromParent();
for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
- EmitFetchClause(I, FetchClauses[i], CfCount);
+ EmitFetchClause(I, DL, FetchClauses[i], CfCount);
for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
- EmitALUClause(I, AluClauses[i], CfCount);
+ EmitALUClause(I, DL, AluClauses[i], CfCount);
break;
}
default:
@@ -680,13 +678,13 @@ public:
.addImm(Alu->getOperand(8).getImm());
Alu->eraseFromParent();
}
- MFI->StackSize = CFStack.MaxStackSize;
+ MFI->CFStackSize = CFStack.MaxStackSize;
}
return false;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "R600 Control Flow Finalizer Pass";
}
};
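Note on the DebugLoc plumbing in this file: computing DL once from the soon-to-be-erased instruction and passing it into EmitFetchClause/EmitALUClause avoids reading a location off an instruction, or an iterator, that may already be invalid. A minimal sketch of that ordering, using only APIs that already appear in this patch; the helper name and the MarkerDesc parameter are illustrative, not part of the change:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"

static void replaceWithMarker(llvm::MachineBasicBlock *BB,
                              llvm::MachineBasicBlock::iterator MI,
                              const llvm::MCInstrDesc &MarkerDesc) {
  llvm::DebugLoc DL = MI->getDebugLoc(); // copy while MI is still valid
  MI->eraseFromParent();                 // nothing may be read off MI after this
  llvm::BuildMI(BB, DL, MarkerDesc);     // append the marker using the saved copy
}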
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 93ed5be..9a5db6c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -307,7 +307,7 @@ public:
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin();
- if (I->getOpcode() == AMDGPU::CF_ALU)
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
if (isALU(*I))
@@ -319,7 +319,7 @@ public:
return false;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "R600 Emit Clause Markers Pass";
}
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 0385b62..3e46e63 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -42,7 +42,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "R600 Expand special instructions pass";
}
};
@@ -116,85 +116,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
continue;
}
-
- case AMDGPU::INTERP_PAIR_XY: {
- MachineInstr *BMI;
- unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
- MI.getOperand(2).getImm());
-
- for (unsigned Chan = 0; Chan < 4; ++Chan) {
- unsigned DstReg;
-
- if (Chan < 2)
- DstReg = MI.getOperand(Chan).getReg();
- else
- DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;
-
- BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
- DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
-
- if (Chan > 0) {
- BMI->bundleWithPred();
- }
- if (Chan >= 2)
- TII->addFlag(*BMI, 0, MO_FLAG_MASK);
- if (Chan != 3)
- TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
- }
-
- MI.eraseFromParent();
- continue;
- }
-
- case AMDGPU::INTERP_PAIR_ZW: {
- MachineInstr *BMI;
- unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
- MI.getOperand(2).getImm());
-
- for (unsigned Chan = 0; Chan < 4; ++Chan) {
- unsigned DstReg;
-
- if (Chan < 2)
- DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
- else
- DstReg = MI.getOperand(Chan-2).getReg();
-
- BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
- DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
-
- if (Chan > 0) {
- BMI->bundleWithPred();
- }
- if (Chan < 2)
- TII->addFlag(*BMI, 0, MO_FLAG_MASK);
- if (Chan != 3)
- TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
- }
-
- MI.eraseFromParent();
- continue;
- }
-
- case AMDGPU::INTERP_VEC_LOAD: {
- const R600RegisterInfo &TRI = TII->getRegisterInfo();
- MachineInstr *BMI;
- unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
- MI.getOperand(1).getImm());
- unsigned DstReg = MI.getOperand(0).getReg();
-
- for (unsigned Chan = 0; Chan < 4; ++Chan) {
- BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
- TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
- if (Chan > 0) {
- BMI->bundleWithPred();
- }
- if (Chan != 3)
- TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
- }
-
- MI.eraseFromParent();
- continue;
- }
case AMDGPU::DOT_4: {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index dd5681f..5813786 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -11,5 +11,4 @@
using namespace llvm;
-R600FrameLowering::~R600FrameLowering() {
-}
+R600FrameLowering::~R600FrameLowering() = default;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index 5fe4e0d..874435f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -19,12 +19,14 @@ public:
R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
unsigned TransAl = 1) :
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
- virtual ~R600FrameLowering();
+ ~R600FrameLowering() override;
- void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override {}
+ void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override {}
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8ccd176..77fee435 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -17,16 +17,36 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
+#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -72,7 +92,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
-
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
@@ -80,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+ // We need to include these since truncating stores to PRIVATE need
+ // special handling to accommodate the RMW sequence.
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
// Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
@@ -192,12 +223,12 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Source);
-
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::LOAD);
}
const R600Subtarget *R600TargetLowering::getSubtarget() const {
@@ -205,13 +236,15 @@ const R600Subtarget *R600TargetLowering::getSubtarget() const {
}
static inline bool isEOP(MachineBasicBlock::iterator I) {
+ if (std::next(I) == I->getParent()->end())
+ return false;
return std::next(I)->getOpcode() == AMDGPU::RETURN;
}
MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- MachineFunction * MF = BB->getParent();
+ MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = MI;
const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
@@ -278,10 +311,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.bitcastToAPInt()
.getZExtValue());
break;
+
case AMDGPU::MOV_IMM_I32:
TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
MI.getOperand(1).getImm());
break;
+
case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
//TODO: Perhaps combine this instruction with the next if possible
auto MIB = TII->buildDefaultInstruction(
@@ -291,6 +326,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MIB->getOperand(Idx) = MI.getOperand(1);
break;
}
+
case AMDGPU::CONST_COPY: {
MachineInstr *NewMI = TII->buildDefaultInstruction(
*BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
@@ -301,228 +337,20 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
+ case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.addOperand(MI.getOperand(0))
.addOperand(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
- }
- case AMDGPU::RAT_STORE_TYPED_eg: {
+
+ case AMDGPU::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
.addOperand(MI.getOperand(0))
.addOperand(MI.getOperand(1))
.addOperand(MI.getOperand(2))
.addImm(isEOP(I)); // Set End of program bit
break;
- }
-
- case AMDGPU::TXD: {
- unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI.getOperand(4);
- MachineOperand &SID = MI.getOperand(5);
- unsigned TextureId = MI.getOperand(6).getImm();
- unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
- unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
-
- switch (TextureId) {
- case 5: // Rect
- CTX = CTY = 0;
- break;
- case 6: // Shadow1D
- SrcW = SrcZ;
- break;
- case 7: // Shadow2D
- SrcW = SrcZ;
- break;
- case 8: // ShadowRect
- CTX = CTY = 0;
- SrcW = SrcZ;
- break;
- case 9: // 1DArray
- SrcZ = SrcY;
- CTZ = 0;
- break;
- case 10: // 2DArray
- CTZ = 0;
- break;
- case 11: // Shadow1DArray
- SrcZ = SrcY;
- CTZ = 0;
- break;
- case 12: // Shadow2DArray
- CTZ = 0;
- break;
- }
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
- T0)
- .addOperand(MI.getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
- T1)
- .addOperand(MI.getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
- break;
- }
-
- case AMDGPU::TXD_SHADOW: {
- unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI.getOperand(4);
- MachineOperand &SID = MI.getOperand(5);
- unsigned TextureId = MI.getOperand(6).getImm();
- unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
- unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
-
- switch (TextureId) {
- case 5: // Rect
- CTX = CTY = 0;
- break;
- case 6: // Shadow1D
- SrcW = SrcZ;
- break;
- case 7: // Shadow2D
- SrcW = SrcZ;
- break;
- case 8: // ShadowRect
- CTX = CTY = 0;
- SrcW = SrcZ;
- break;
- case 9: // 1DArray
- SrcZ = SrcY;
- CTZ = 0;
- break;
- case 10: // 2DArray
- CTZ = 0;
- break;
- case 11: // Shadow1DArray
- SrcZ = SrcY;
- CTZ = 0;
- break;
- case 12: // Shadow2DArray
- CTZ = 0;
- break;
- }
-
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
- T0)
- .addOperand(MI.getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
- T1)
- .addOperand(MI.getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
- break;
- }
case AMDGPU::BRANCH:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
@@ -534,7 +362,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
.addOperand(MI.getOperand(1))
- .addImm(OPCODE_IS_NOT_ZERO)
+ .addImm(AMDGPU::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
@@ -548,7 +376,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
.addOperand(MI.getOperand(1))
- .addImm(OPCODE_IS_NOT_ZERO_INT)
+ .addImm(AMDGPU::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
@@ -592,12 +420,6 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
break;
}
case AMDGPU::RETURN: {
- // RETURN instructions must have the live-out registers as implicit uses,
- // otherwise they appear dead.
- R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
- MachineInstrBuilder MIB(*MF, MI);
- for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
- MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
return BB;
}
}
@@ -654,7 +476,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
DAG.getConstant(3, DL, MVT::i32) // SWZ_W
};
- return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
}
// default for switch(IntrinsicID)
@@ -671,15 +493,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
switch(IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case AMDGPUIntrinsic::r600_tex:
- case AMDGPUIntrinsic::r600_texc:
- case AMDGPUIntrinsic::r600_txl:
- case AMDGPUIntrinsic::r600_txlc:
- case AMDGPUIntrinsic::r600_txb:
- case AMDGPUIntrinsic::r600_txbc:
- case AMDGPUIntrinsic::r600_txf:
- case AMDGPUIntrinsic::r600_txq:
- case AMDGPUIntrinsic::r600_ddx:
- case AMDGPUIntrinsic::r600_ddy: {
+ case AMDGPUIntrinsic::r600_texc: {
unsigned TextureOp;
switch (IntrinsicID) {
case AMDGPUIntrinsic::r600_tex:
@@ -688,32 +502,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case AMDGPUIntrinsic::r600_texc:
TextureOp = 1;
break;
- case AMDGPUIntrinsic::r600_txl:
- TextureOp = 2;
- break;
- case AMDGPUIntrinsic::r600_txlc:
- TextureOp = 3;
- break;
- case AMDGPUIntrinsic::r600_txb:
- TextureOp = 4;
- break;
- case AMDGPUIntrinsic::r600_txbc:
- TextureOp = 5;
- break;
- case AMDGPUIntrinsic::r600_txf:
- TextureOp = 6;
- break;
- case AMDGPUIntrinsic::r600_txq:
- TextureOp = 7;
- break;
- case AMDGPUIntrinsic::r600_ddx:
- TextureOp = 8;
- break;
- case AMDGPUIntrinsic::r600_ddy:
- TextureOp = 9;
- break;
default:
- llvm_unreachable("Unknow Texture Operation");
+ llvm_unreachable("unhandled texture operation");
}
SDValue TexArgs[19] = {
@@ -785,12 +575,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
- case Intrinsic::r600_read_workdim:
- case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
- uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
- return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
- }
-
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T1_X, VT);
@@ -836,9 +620,10 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
return;
}
- // Fall-through. Since we don't care about out of bounds values
- // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
- // considers some extra cases which are not necessary here.
+ // Since we don't care about out of bounds values we can use FP_TO_SINT for
+ // uints too. The DAGLegalizer code for uint considers some extra cases
+ // which are not necessary here.
+ LLVM_FALLTHROUGH;
case ISD::FP_TO_SINT: {
if (N->getValueType(0) == MVT::i1) {
Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
@@ -867,14 +652,12 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
SDValue Vector) const {
-
SDLoc DL(Vector);
EVT VecVT = Vector.getValueType();
EVT EltVT = VecVT.getVectorElementType();
SmallVector<SDValue, 8> Args;
- for (unsigned i = 0, e = VecVT.getVectorNumElements();
- i != e; ++i) {
+ for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
Args.push_back(DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
@@ -885,7 +668,6 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
-
SDLoc DL(Op);
SDValue Vector = Op.getOperand(0);
SDValue Index = Op.getOperand(1);
@@ -919,7 +701,6 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
-
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -1318,90 +1099,158 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
SelectionDAG &DAG) const {
SDLoc DL(Store);
+ //TODO: Who creates the i8 stores?
+ assert(Store->isTruncatingStore()
+ || Store->getValue().getValueType() == MVT::i8);
+ assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
- unsigned Mask = 0;
+ SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
+ assert(Store->getAlignment() >= 1);
+ Mask = DAG.getConstant(0xff, DL, MVT::i32);
} else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
+ assert(Store->getAlignment() >= 2);
+ Mask = DAG.getConstant(0xffff, DL, MVT::i32);
+ } else {
+ llvm_unreachable("Unsupported private trunc store");
}
- SDValue Chain = Store->getChain();
+ SDValue OldChain = Store->getChain();
+ bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
+ // Skip dummy
+ SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
SDValue BasePtr = Store->getBasePtr();
+ SDValue Offset = Store->getOffset();
EVT MemVT = Store->getMemoryVT();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
+
+ // Get dword location
+ // TODO: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
+
+ Chain = Dst.getValue(1);
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ // Get offset in dword
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
DAG.getConstant(0x3, DL, MVT::i32));
+ // Convert byte offset to bit shift
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
+ // TODO: Contrary to the name of the function,
+ // it also handles sub-i32 non-truncating stores (like i1).
SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
Store->getValue());
+ // Mask the value to the right type
SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+ // Shift the value in place
SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
MaskedValue, ShiftAmt);
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
+ // Shift the mask in place
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
+
+ // Invert the mask. NOTE: if we had native ROL instructions we could
+ // use inverted mask
+ DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
+
+ // Clean up the target bits
Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ // Add the new bits
SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+
+ // Store dword
+ // TODO: Can we be smarter about MachinePointerInfo?
+ SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
+
+ // If we are part of an expanded vector, make our neighbors depend on this store
+ if (VectorTrunc) {
+ // Make all other vector elements depend on this store
+ Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
+ DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
+ }
+ return NewStore;
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
- return Result;
-
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
unsigned AS = StoreNode->getAddressSpace();
+
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
SDValue Value = StoreNode->getValue();
- EVT ValueVT = Value.getValueType();
+ EVT VT = Value.getValueType();
+ EVT MemVT = StoreNode->getMemoryVT();
+ EVT PtrVT = Ptr.getValueType();
+
+ SDLoc DL(Op);
+
+ // Neither LOCAL nor PRIVATE can do vectors at the moment
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- ValueVT.isVector()) {
- return SplitVectorStore(Op, DAG);
+ VT.isVector()) {
+ if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
+ // Add an extra level of chain to isolate this vector
+ SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
+ // TODO: can the chain be replaced without creating a new store?
+ SDValue NewStore = DAG.getTruncStore(
+ NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
+ MemVT, StoreNode->getAlignment(),
+ StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
+ StoreNode = cast<StoreSDNode>(NewStore);
+ }
+
+ return scalarizeVectorStore(StoreNode, DAG);
}
- SDLoc DL(Op);
- SDValue Chain = StoreNode->getChain();
- SDValue Ptr = StoreNode->getBasePtr();
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ return expandUnalignedStore(StoreNode, DAG);
+ }
+
+ SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
+ DAG.getConstant(2, DL, PtrVT));
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+ // It is beneficial to create MSKOR here instead of in the combiner to avoid
+ // the artificial dependencies introduced by RMW.
if (StoreNode->isTruncatingStore()) {
- EVT VT = Value.getValueType();
assert(VT.bitsLE(MVT::i32));
- EVT MemVT = StoreNode->getMemoryVT();
SDValue MaskConstant;
if (MemVT == MVT::i8) {
MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
} else {
assert(MemVT == MVT::i16);
+ assert(StoreNode->getAlignment() >= 2);
MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
}
- SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
- DAG.getConstant(0x00000003, DL, VT));
+
+ SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
+ DAG.getConstant(0x00000003, DL, PtrVT));
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+ DAG.getConstant(3, DL, VT));
+
+ // Put the mask in the correct place
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
+
+ // Put the value bits in the correct place
SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
- SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
- DAG.getConstant(3, DL, VT));
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
- SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
+
// XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
// vector instead.
SDValue Src[4] = {
@@ -1415,12 +1264,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
- } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
- ValueVT.bitsGE(MVT::i32)) {
+ } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
// Convert pointer from byte address to dword address.
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
- DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
- Ptr, DAG.getConstant(2, DL, MVT::i32)));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
llvm_unreachable("Truncated and indexed stores not supported yet");
@@ -1431,50 +1277,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
- EVT MemVT = StoreNode->getMemoryVT();
if (MemVT.bitsLT(MVT::i32))
return lowerPrivateTruncStore(StoreNode, DAG);
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (ValueVT.isVector()) {
- unsigned NumElemVT = ValueVT.getVectorNumElements();
- EVT ElemVT = ValueVT.getVectorElementType();
- SmallVector<SDValue, 4> Stores(NumElemVT);
-
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
- Value, DAG.getConstant(i, DL, MVT::i32));
-
- Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Elem, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32));
- }
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
- } else {
- if (ValueVT == MVT::i8) {
- Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
- }
- Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+ // Standard i32+ store, tag it with DWORDADDR to note that the address
+ // has been shifted
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
+ return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
- return Chain;
+ // Tagged i32+ stores will be matched by patterns
+ return SDValue();
}
// return (512 + (kc_bank << 12)
@@ -1524,51 +1342,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
EVT MemVT = Load->getMemoryVT();
+ assert(Load->getAlignment() >= MemVT.getStoreSize());
+
+ SDValue BasePtr = Load->getBasePtr();
+ SDValue Chain = Load->getChain();
+ SDValue Offset = Load->getOffset();
+
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
+ // Get dword location
+ // NOTE: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(),
- Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
// Get offset within the register.
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
+ LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
// Bit offset of target byte (byteIdx * 8).
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
// Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+ SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
// Eliminate the upper bits by setting them to ...
EVT MemEltVT = MemVT.getScalarType();
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
+ if (ExtType == ISD::SEXTLOAD) { // ... ones.
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
+ Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+ } else { // ... or zeros.
+ Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}
- // ... or zeros.
SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
+ Ret,
+ Read.getValue(1) // This should be our output chain
};
return DAG.getMergeValues(Ops, DL);
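As the counterpart for loads, a minimal scalar sketch (again with an illustrative helper name and flat memory model, not part of the patch) of what lowerPrivateExtLoad assembles: load the containing dword, shift the wanted byte or halfword down, then sign- or zero-extend it in register:

#include <cstdint>

// Extract a Bits-wide (8 or 16) value at byte address ByteAddr from a
// dword-organized private buffer, with the chosen extension to 32 bits.
static uint32_t privateExtLoad(const uint32_t *Mem, uint32_t ByteAddr,
                               unsigned Bits, bool SignExt) {
  const uint32_t Dword = Mem[ByteAddr >> 2];   // load the containing dword
  const uint32_t Shift = (ByteAddr & 3u) * 8u; // byte offset -> bit shift
  const uint32_t Val = Dword >> Shift;         // shift the target bits down
  const unsigned Spare = 32 - Bits;
  if (SignExt)                                 // eliminate the upper bits with ones
    return (uint32_t)((int32_t)(Val << Spare) >> Spare);
  return Val & (0xffffffffu >> Spare);         // ... or with zeros
}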
@@ -1590,12 +1407,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
- SDValue MergedValues[2] = {
- scalarizeVectorLoad(LoadNode, DAG),
- Chain
- };
- return DAG.getMergeValues(MergedValues, DL);
+ if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ VT.isVector()) {
+ return scalarizeVectorLoad(LoadNode, DAG);
}
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1646,8 +1461,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- SDValue LoweredLoad;
-
// For most operations returning SDValue() will result in the node being
// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
// need to manually expand loads that may be legal in some address spaces and
@@ -1672,47 +1485,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (VT.isVector()) {
- unsigned NumElemVT = VT.getVectorNumElements();
- EVT ElemVT = VT.getVectorElementType();
- SDValue Loads[4];
-
- assert(NumElemVT <= 4);
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
- Chain, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32),
- Op.getOperand(2));
- }
- EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
- LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
- } else {
- LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32), // Channel
- Op.getOperand(2));
+ // The DWORDADDR ISD node marks an already-shifted address.
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ assert(VT == MVT::i32);
+ Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+ return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
}
-
- SDValue Ops[2] = {
- LoweredLoad,
- Chain
- };
-
- return DAG.getMergeValues(Ops, DL);
+ return SDValue();
}
SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
@@ -1754,9 +1534,11 @@ SDValue R600TargetLowering::LowerFormalArguments(
SmallVector<ISD::InputArg, 8> LocalIns;
- getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
-
- AnalyzeFormalArguments(CCInfo, LocalIns);
+ if (AMDGPU::isShader(CallConv)) {
+ AnalyzeFormalArguments(CCInfo, Ins);
+ } else {
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+ }
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -1800,18 +1582,19 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
- unsigned Offset = 36 + VA.getLocMemOffset();
+ unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
ISD::UNINDEXED, Ext, VT, DL, Chain,
DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
- MemVT, /* Alignment = */ 4,
- MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
+ MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
- MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
+ MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
}
return Chain;
}
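
Note on the argument-offset hunk above: the load offset is now the subtarget's explicit kernel-argument offset plus the argument's LocMemOffset, rather than a hardcoded 36-byte header. A minimal standalone sketch of that arithmetic, assuming illustrative numbers and helper names not taken from the patch:

  #include <cstdint>
  #include <cstdio>

  // Hedged sketch: each kernel argument is loaded from constant memory at
  // a subtarget-provided base plus the argument's location offset.
  static uint64_t argLoadOffset(uint64_t ExplicitKernArgBase,
                                uint64_t LocMemOffset) {
    return ExplicitKernArgBase + LocMemOffset;
  }

  int main() {
    // With a 36-byte base and a 16-byte location offset the load lands at 52.
    std::printf("%llu\n", (unsigned long long)argLoadOffset(36, 16));
  }
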
@@ -1949,7 +1732,6 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
return BuildVector;
}
-
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//
@@ -1957,14 +1739,14 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
switch (N->getOpcode()) {
- default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
// (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
case ISD::FP_ROUND: {
SDValue Arg = N->getOperand(0);
if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
- return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
Arg.getOperand(0));
}
break;
@@ -1989,12 +1771,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
- SDLoc dl(N);
- return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
+ return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
SelectCC.getOperand(0), // LHS
SelectCC.getOperand(1), // RHS
- DAG.getConstant(-1, dl, MVT::i32), // True
- DAG.getConstant(0, dl, MVT::i32), // False
+ DAG.getConstant(-1, DL, MVT::i32), // True
+ DAG.getConstant(0, DL, MVT::i32), // False
SelectCC.getOperand(4)); // CC
break;
@@ -2006,7 +1787,6 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
SDValue EltNo = N->getOperand(2);
- SDLoc dl(N);
// If the inserted element is an UNDEF, just use the input vector.
if (InVal.isUndef())
@@ -2044,13 +1824,13 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
EVT OpVT = Ops[0].getValueType();
if (InVal.getValueType() != OpVT)
InVal = OpVT.bitsGT(InVal.getValueType()) ?
- DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
- DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+ DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
+ DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
Ops[Elt] = InVal;
}
// Return the new vector
- return DAG.getBuildVector(VT, dl, Ops);
+ return DAG.getBuildVector(VT, DL, Ops);
}
// Extract_vec (Build_vector) generated by custom lowering
@@ -2064,11 +1844,13 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
}
if (Arg.getOpcode() == ISD::BITCAST &&
- Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ (Arg.getOperand(0).getValueType().getVectorNumElements() ==
+ Arg.getValueType().getVectorNumElements())) {
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
unsigned Element = Const->getZExtValue();
- return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
- Arg->getOperand(0).getOperand(Element));
+ return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
+ Arg->getOperand(0).getOperand(Element));
}
}
break;
@@ -2109,7 +1891,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
LHS.getOperand(0).getValueType().isInteger());
if (DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
- return DAG.getSelectCC(SDLoc(N),
+ return DAG.getSelectCC(DL,
LHS.getOperand(0),
LHS.getOperand(1),
LHS.getOperand(2),
@@ -2121,7 +1903,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
- case AMDGPUISD::EXPORT: {
+ case AMDGPUISD::R600_EXPORT: {
SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR)
break;
@@ -2136,9 +1918,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
N->getOperand(6), // SWZ_Z
N->getOperand(7) // SWZ_W
};
- SDLoc DL(N);
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
- return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
}
case AMDGPUISD::TEXTURE_FETCH: {
SDValue Arg = N->getOperand(1);
@@ -2166,10 +1947,10 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
N->getOperand(17),
N->getOperand(18),
};
- SDLoc DL(N);
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
}
+ default: break;
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -2262,7 +2043,6 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
uint64_t ImmValue = 0;
-
if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
float FloatValue = FPC->getValueAPF().convertToFloat();
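
Note on the extload lowering earlier in this file: the sub-dword load is rewritten as a dword read, a shift by the byte offset, and a sign- or zero-extend in place. A plain C++ stand-in for that sequence (not SelectionDAG code; names and the memory model are illustrative):

  #include <cstdint>
  #include <cstdio>

  static uint32_t loadExt(const uint32_t *Mem, uint32_t ByteAddr,
                          unsigned BitWidth, bool Signed) {
    uint32_t Dword    = Mem[ByteAddr >> 2];        // load the containing dword
    uint32_t ShiftAmt = (ByteAddr & 3u) * 8u;      // bit offset of target byte
    uint32_t Val      = Dword >> ShiftAmt;         // shift to the right
    uint32_t Mask     = (BitWidth >= 32) ? ~0u : ((1u << BitWidth) - 1u);
    Val &= Mask;                                   // zero-extend in register...
    if (Signed && (Val & (1u << (BitWidth - 1))))  // ...or sign-extend
      Val |= ~Mask;
    return Val;
  }

  int main() {
    uint32_t Mem[1] = {0x11F2A3B4u};
    // Byte 2 of the little-endian dword is 0xF2; sign-extended -> 0xfffffff2.
    std::printf("0x%x\n", loadExt(Mem, 2, 8, true));
  }
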
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
index 0ffd485..68fcc54 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
@@ -210,14 +210,14 @@ class VTX_WORD0 {
bits<5> VC_INST;
bits<2> FETCH_TYPE;
bits<1> FETCH_WHOLE_QUAD;
- bits<8> BUFFER_ID;
+ bits<8> buffer_id;
bits<1> SRC_REL;
bits<2> SRC_SEL_X;
let Word0{4-0} = VC_INST;
let Word0{6-5} = FETCH_TYPE;
let Word0{7} = FETCH_WHOLE_QUAD;
- let Word0{15-8} = BUFFER_ID;
+ let Word0{15-8} = buffer_id;
let Word0{22-16} = src_gpr;
let Word0{23} = SRC_REL;
let Word0{25-24} = SRC_SEL_X;
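
Note on the rename above: buffer_id is an ordinary bitfield of Word0, occupying bits [15:8]. A hedged sketch of how such a word could be packed and the field read back; the helper covers only a subset of the fields and is illustrative, not the TableGen-generated encoder:

  #include <cstdint>
  #include <cstdio>

  static uint32_t encodeWord0(uint32_t VcInst, uint32_t FetchType,
                              uint32_t BufferId, uint32_t SrcGpr) {
    uint32_t W = 0;
    W |= (VcInst    & 0x1Fu);        // Word0[4:0]   = VC_INST
    W |= (FetchType & 0x03u) << 5;   // Word0[6:5]   = FETCH_TYPE
    W |= (BufferId  & 0xFFu) << 8;   // Word0[15:8]  = buffer_id
    W |= (SrcGpr    & 0x7Fu) << 16;  // Word0[22:16] = src_gpr
    return W;
  }

  int main() {
    uint32_t W = encodeWord0(0, 2, 3, 1);
    std::printf("buffer_id = %u\n", (W >> 8) & 0xFFu);  // prints 3
  }
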
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 1c5f7ec..e88bd07 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -320,12 +320,12 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
const DenseMap<unsigned, unsigned> &PV,
unsigned &ConstCount) const {
ConstCount = 0;
- ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI);
const std::pair<int, unsigned> DummyPair(-1, 0);
std::vector<std::pair<int, unsigned> > Result;
unsigned i = 0;
- for (unsigned n = Srcs.size(); i < n; ++i) {
- unsigned Reg = Srcs[i].first->getReg();
+ for (const auto &Src : getSrcs(MI)) {
+ ++i;
+ unsigned Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
if (Reg == AMDGPU::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
@@ -592,9 +592,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
if (!isALUInstr(MI.getOpcode()))
continue;
- ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI);
-
- for (const auto &Src:Srcs) {
+ for (const auto &Src : getSrcs(MI)) {
if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
@@ -667,7 +665,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// handled
if (isBranch(I->getOpcode()))
return true;
- if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
+ if (!isJump(I->getOpcode())) {
return false;
}
@@ -682,8 +680,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst.getOpcode();
- if (I == MBB.begin() ||
- !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
+ if (I == MBB.begin() || !isJump((--I)->getOpcode())) {
if (LastOpc == AMDGPU::JUMP) {
TBB = LastInst.getOperand(0).getMBB();
return false;
@@ -729,17 +726,19 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
It != E; ++It) {
if (It->getOpcode() == AMDGPU::CF_ALU ||
It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
- return std::prev(It.base());
+ return It.getReverse();
}
return MBB.end();
}
-unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const {
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ const DebugLoc &DL,
+ int *BytesAdded) const {
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert(!BytesAdded && "code size not handled");
if (!FBB) {
if (Cond.empty()) {
@@ -779,8 +778,9 @@ unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
}
}
-unsigned
-R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
// Note : we leave PRED* instructions there.
// They may be needed when predicating instructions.
@@ -910,20 +910,20 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
bool
-R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
switch (MO.getImm()) {
- case OPCODE_IS_ZERO_INT:
- MO.setImm(OPCODE_IS_NOT_ZERO_INT);
+ case AMDGPU::PRED_SETE_INT:
+ MO.setImm(AMDGPU::PRED_SETNE_INT);
break;
- case OPCODE_IS_NOT_ZERO_INT:
- MO.setImm(OPCODE_IS_ZERO_INT);
+ case AMDGPU::PRED_SETNE_INT:
+ MO.setImm(AMDGPU::PRED_SETE_INT);
break;
- case OPCODE_IS_ZERO:
- MO.setImm(OPCODE_IS_NOT_ZERO);
+ case AMDGPU::PRED_SETE:
+ MO.setImm(AMDGPU::PRED_SETNE);
break;
- case OPCODE_IS_NOT_ZERO:
- MO.setImm(OPCODE_IS_ZERO);
+ case AMDGPU::PRED_SETNE:
+ MO.setImm(AMDGPU::PRED_SETE);
break;
default:
return true;
@@ -1160,10 +1160,10 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
int Offset = -1;
- if (MFI->getNumObjects() == 0) {
+ if (MFI.getNumObjects() == 0) {
return -1;
}
@@ -1195,14 +1195,14 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
int Offset = 0;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// Variable sized objects are not supported
- if (MFI->hasVarSizedObjects()) {
+ if (MFI.hasVarSizedObjects()) {
return -1;
}
- if (MFI->getNumObjects() == 0) {
+ if (MFI.getNumObjects() == 0) {
return -1;
}
@@ -1481,11 +1481,3 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
FlagOp.setImm(InstFlags);
}
}
-
-bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
-}
-
-bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
-}
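
Note on the reverseBranchCondition hunk above: the hook swaps each predicate opcode with its negation and reports failure for anything else. A self-contained sketch of that pattern, assuming a placeholder enum rather than the generated AMDGPU opcodes:

  #include <cstdio>

  enum Pred { PRED_SETE, PRED_SETNE, PRED_SETE_INT, PRED_SETNE_INT };

  // Returns true on failure, mirroring the reverseBranchCondition convention.
  static bool reversePred(Pred &P) {
    switch (P) {
    case PRED_SETE:      P = PRED_SETNE;     return false;
    case PRED_SETNE:     P = PRED_SETE;      return false;
    case PRED_SETE_INT:  P = PRED_SETNE_INT; return false;
    case PRED_SETNE_INT: P = PRED_SETE_INT;  return false;
    }
    return true;   // unknown predicate: cannot reverse
  }

  int main() {
    Pred P = PRED_SETE_INT;
    if (!reversePred(P))
      std::printf("reversed to %d\n", static_cast<int>(P));  // PRED_SETNE_INT (3)
  }
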
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index feaca98..a280052 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -19,6 +19,14 @@
#include "R600RegisterInfo.h"
namespace llvm {
+
+namespace R600InstrFlags {
+enum : uint64_t {
+ REGISTER_STORE = UINT64_C(1) << 62,
+ REGISTER_LOAD = UINT64_C(1) << 63
+};
+}
+
class AMDGPUTargetMachine;
class DFAPacketizer;
class MachineFunction;
@@ -151,7 +159,7 @@ public:
DFAPacketizer *
CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
- bool ReverseBranchCondition(
+ bool reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const override;
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -159,11 +167,13 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const override;
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
- unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
bool isPredicated(const MachineInstr &MI) const override;
@@ -301,8 +311,13 @@ public:
void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
// Helper functions that check the opcode for status information
- bool isRegisterStore(const MachineInstr &MI) const;
- bool isRegisterLoad(const MachineInstr &MI) const;
+ bool isRegisterStore(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_STORE;
+ }
+
+ bool isRegisterLoad(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD;
+ }
};
namespace AMDGPU {
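
Note on the inlined isRegisterStore/isRegisterLoad helpers above: they test TSFlags bits 62 and 63, which is why the flag constants must be built with 64-bit arithmetic (UINT64_C) rather than a plain int shift. A minimal sketch of the same test outside LLVM:

  #include <cstdint>
  #include <cstdio>

  namespace R600InstrFlagsSketch {
  enum : uint64_t {
    REGISTER_STORE = UINT64_C(1) << 62,   // plain `1 << 62` would overflow int
    REGISTER_LOAD  = UINT64_C(1) << 63
  };
  } // end namespace R600InstrFlagsSketch

  static bool isRegisterLoad(uint64_t TSFlags) {
    return (TSFlags & R600InstrFlagsSketch::REGISTER_LOAD) != 0;
  }

  int main() {
    std::printf("%d\n", isRegisterLoad(R600InstrFlagsSketch::REGISTER_LOAD)); // 1
  }
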
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
index b6b576d..9210e66 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -242,20 +242,6 @@ def TEX_SHADOW_ARRAY : PatLeaf<
}]
>;
-def TEX_MSAA : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 14;
- }]
->;
-
-def TEX_ARRAY_MSAA : PatLeaf<
- (imm),
- [{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 15;
- }]
->;
-
class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
dag outs, dag ins, string asm, list<dag> pattern> :
InstR600ISA <outs, ins, asm, pattern>,
@@ -283,8 +269,8 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
}
-class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : InstR600ISA <outs, (ins MEMxi:$src_gpr), !strconcat(" ", name), pattern>,
+class VTX_READ <string name, dag outs, list<dag> pattern>
+ : InstR600ISA <outs, (ins MEMxi:$src_gpr, i8imm:$buffer_id), !strconcat(" ", name, ", #$buffer_id"), pattern>,
VTX_WORD1_GPR {
// Static fields
@@ -333,9 +319,9 @@ class LoadParamFrag <PatFrag load_type> : PatFrag <
(cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
>;
-def load_param : LoadParamFrag<load>;
-def load_param_exti8 : LoadParamFrag<az_extloadi8>;
-def load_param_exti16 : LoadParamFrag<az_extloadi16>;
+def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
+def vtx_id3_az_extloadi16 : LoadParamFrag<az_extloadi16>;
+def vtx_id3_load : LoadParamFrag<load>;
class LoadVtxId1 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
@@ -450,11 +436,6 @@ def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
// Export Instructions
//===----------------------------------------------------------------------===//
-def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
-
-def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
- [SDNPHasChain, SDNPSideEffect]>;
-
class ExportWord0 {
field bits<32> Word0;
@@ -500,7 +481,7 @@ class ExportBufWord1 {
}
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
- def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+ def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
(i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
(ExportInst R600_Reg128:$src, imm:$type, imm:$base,
imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
@@ -746,6 +727,20 @@ def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
def MOV : R600_1OP <0x19, "MOV", []>;
+
+// This is a hack to get rid of DUMMY_CHAIN nodes.
+// Most DUMMY_CHAINs should be eliminated during legalization, but undef
+// values can sneak some into selection.
+let isPseudo = 1, isCodeGenOnly = 1 in {
+def DUMMY_CHAIN : AMDGPUInst <
+ (outs),
+ (ins),
+ "DUMMY_CHAIN",
+ [(R600dummy_chain)]
+>;
+} // end let isPseudo = 1, isCodeGenOnly = 1
+
+
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
@@ -1073,18 +1068,27 @@ class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
- inst, "MULHI_INT", mulhs
-> {
+ inst, "MULHI_INT", mulhs> {
let Itinerary = TransALU;
}
+
+class MULHI_INT24_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI_INT24", AMDGPUmulhi_i24> {
+ let Itinerary = VecALU;
+}
+
class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
- inst, "MULHI", mulhu
-> {
+ inst, "MULHI", mulhu> {
let Itinerary = TransALU;
}
+
+class MULHI_UINT24_Common <bits<11> inst> : R600_2OP_Helper <
+ inst, "MULHI_UINT24", AMDGPUmulhi_u24> {
+ let Itinerary = VecALU;
+}
+
class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
- inst, "MULLO_INT", mul
-> {
+ inst, "MULLO_INT", mul> {
let Itinerary = TransALU;
}
class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
@@ -1278,6 +1282,17 @@ let Predicates = [isR600] in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+// Hardcode channel to 0
+// NOTE: LSHR is not available here; LSHR is a per-family instruction.
+def : Pat <
+ (i32 (load_private ADDRIndirect:$addr) ),
+ (R600_RegisterLoad FRAMEri:$addr, (i32 0))
+>;
+def : Pat <
+ (store_private i32:$val, ADDRIndirect:$addr),
+ (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0))
+>;
+
//===----------------------------------------------------------------------===//
// Pseudo instructions
@@ -1366,8 +1381,8 @@ def CONST_COPY : Instruction {
} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
def TEX_VTX_CONSTBUF :
- InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
- [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "VTX_READ_eg $dst, $ptr",
+ [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$buffer_id)))]>,
VTX_WORD1_GPR, VTX_WORD0_eg {
let VC_INST = 0;
@@ -1420,7 +1435,7 @@ def TEX_VTX_CONSTBUF :
}
def TEX_VTX_TEXBUF:
- InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
VTX_WORD1_GPR, VTX_WORD0_eg {
let VC_INST = 0;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
index 01105c6..3ca319c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
@@ -12,9 +12,5 @@
using namespace llvm;
-
-// Pin the vtable to this file.
-void R600MachineFunctionInfo::anchor() {}
-
R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF) { }
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
index 04a4436..29ac092 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -14,18 +14,13 @@
#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include <vector>
namespace llvm {
class R600MachineFunctionInfo final : public AMDGPUMachineFunction {
- void anchor() override;
public:
R600MachineFunctionInfo(const MachineFunction &MF);
- SmallVector<unsigned, 4> LiveOuts;
- std::vector<unsigned> IndirectRegs;
- unsigned StackSize;
+ unsigned CFStackSize;
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
index 16d5d93..9a67705 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
#include "llvm/CodeGen/MachineScheduler.h"
+#include <vector>
using namespace llvm;
@@ -25,10 +26,10 @@ class R600InstrInfo;
struct R600RegisterInfo;
class R600SchedStrategy final : public MachineSchedStrategy {
- const ScheduleDAGMILive *DAG;
- const R600InstrInfo *TII;
- const R600RegisterInfo *TRI;
- MachineRegisterInfo *MRI;
+ const ScheduleDAGMILive *DAG = nullptr;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
enum InstKind {
IDAlu,
@@ -66,11 +67,8 @@ class R600SchedStrategy final : public MachineSchedStrategy {
int OccupedSlotsMask;
public:
- R600SchedStrategy() :
- DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) {
- }
-
- virtual ~R600SchedStrategy() {}
+ R600SchedStrategy() = default;
+ ~R600SchedStrategy() override = default;
void initialize(ScheduleDAGMI *dag) override;
SUnit *pickNode(bool &IsTopNode) override;
@@ -97,6 +95,6 @@ private:
void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
};
-} // namespace llvm
+} // end namespace llvm
-#endif /* R600MACHINESCHEDULER_H_ */
+#endif // LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
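
Note on the scheduler-strategy cleanup above: a constructor that only null-initialized members is replaced by in-class initializers plus defaulted special members. A small illustration of the idiom with placeholder types:

  #include <cstdio>

  struct Dag {};   // placeholder for ScheduleDAGMILive

  struct SchedStrategySketch {
    const Dag *DAG = nullptr;      // was null-initialized in the constructor body
    int OccupiedSlotsMask = 0;

    SchedStrategySketch() = default;
    ~SchedStrategySketch() = default;
  };

  int main() {
    SchedStrategySketch S;
    std::printf("%d\n", S.DAG == nullptr);  // prints 1
  }
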
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index ecae27d..d90008a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -31,22 +31,31 @@
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
-#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/PassAnalysisSupport.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+#include <vector>
using namespace llvm;
#define DEBUG_TYPE "vec-merger"
-namespace {
-
static bool
isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
@@ -60,11 +69,14 @@ isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
return false;
}
+namespace {
+
class RegSeqInfo {
public:
MachineInstr *Instr;
DenseMap<unsigned, unsigned> RegToChan;
std::vector<unsigned> UndefReg;
+
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
@@ -76,7 +88,8 @@ public:
RegToChan[MO.getReg()] = Chan;
}
}
- RegSeqInfo() {}
+
+ RegSeqInfo() = default;
bool operator==(const RegSeqInfo &RSI) const {
return RSI.Instr == Instr;
@@ -87,28 +100,30 @@ class R600VectorRegMerger : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const R600InstrInfo *TII;
- bool canSwizzle(const MachineInstr &) const;
+
+ bool canSwizzle(const MachineInstr &MI) const;
bool areAllUsesSwizzeable(unsigned Reg) const;
void SwizzleInput(MachineInstr &,
- const std::vector<std::pair<unsigned, unsigned> > &) const;
- bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
- std::vector<std::pair<unsigned, unsigned> > &Remap) const;
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
+ bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge,
+ std::vector<std::pair<unsigned, unsigned>> &Remap) const;
bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
- std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan);
bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
- std::vector<std::pair<unsigned, unsigned> > &RemapChan);
- MachineInstr *RebuildVector(RegSeqInfo *MI,
- const RegSeqInfo *BaseVec,
- const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan);
+ MachineInstr *RebuildVector(RegSeqInfo *MI, const RegSeqInfo *BaseVec,
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
void RemoveMI(MachineInstr *);
void trackRSI(const RegSeqInfo &RSI);
- typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
+ typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap;
DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
InstructionSetMap PreviousRegSeqByReg;
InstructionSetMap PreviousRegSeqByUndefCount;
+
public:
static char ID;
+
R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
TII(nullptr) { }
@@ -121,13 +136,15 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "R600 Vector Registers Merge Pass";
}
bool runOnMachineFunction(MachineFunction &Fn) override;
};
+} // end anonymous namespace.
+
char R600VectorRegMerger::ID = 0;
bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
@@ -144,7 +161,7 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
}
bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
- RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
+ RegSeqInfo *ToMerge, std::vector<std::pair<unsigned, unsigned>> &Remap)
const {
unsigned CurrentUndexIdx = 0;
for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
@@ -167,7 +184,7 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
static
unsigned getReassignedChan(
- const std::vector<std::pair<unsigned, unsigned> > &RemapChan,
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan,
unsigned Chan) {
for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
if (RemapChan[j].first == Chan)
@@ -178,7 +195,7 @@ unsigned getReassignedChan(
MachineInstr *R600VectorRegMerger::RebuildVector(
RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
- const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
unsigned Reg = RSI->Instr->getOperand(0).getReg();
MachineBasicBlock::iterator Pos = RSI->Instr;
MachineBasicBlock &MBB = *Pos->getParent();
@@ -200,12 +217,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
.addReg(SubReg)
.addImm(Chan);
UpdatedRegToChan[SubReg] = Chan;
- std::vector<unsigned>::iterator ChanPos =
- std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan);
+ std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
if (ChanPos != UpdatedUndef.end())
UpdatedUndef.erase(ChanPos);
- assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) ==
- UpdatedUndef.end() &&
+ assert(!is_contained(UpdatedUndef, Chan) &&
"UpdatedUndef shouldn't contain Chan more than once!");
DEBUG(dbgs() << " ->"; Tmp->dump(););
(void)Tmp;
@@ -236,17 +251,17 @@ void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
E = PreviousRegSeqByReg.end(); It != E; ++It) {
std::vector<MachineInstr *> &MIs = (*It).second;
- MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ MIs.erase(llvm::find(MIs, MI), MIs.end());
}
for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
std::vector<MachineInstr *> &MIs = (*It).second;
- MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ MIs.erase(llvm::find(MIs, MI), MIs.end());
}
}
void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
- const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
unsigned Offset;
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
Offset = 2;
@@ -274,7 +289,7 @@ bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
- std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
if (!MOp->isReg())
@@ -294,7 +309,7 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
- std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
return false;
@@ -357,7 +372,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
});
RegSeqInfo CandidateRSI;
- std::vector<std::pair<unsigned, unsigned> > RemapChan;
+ std::vector<std::pair<unsigned, unsigned>> RemapChan;
DEBUG(dbgs() << "Using common slots...\n";);
if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
// Remove CandidateRSI mapping
@@ -381,8 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
return false;
}
-}
-
llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
return new R600VectorRegMerger(tm);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index c848664..5b6dd1e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -47,9 +47,7 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
- const char *getPassName() const override {
- return "R600 Packetizer";
- }
+ StringRef getPassName() const override { return "R600 Packetizer"; }
bool runOnMachineFunction(MachineFunction &Fn) override;
};
@@ -283,7 +281,7 @@ public:
return false;
}
- // We cannot read LDS source registrs from the Trans slot.
+ // We cannot read LDS source registers from the Trans slot.
if (isTransSlot && TII->readsLDSSrcReg(MI))
return false;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 5f182c5..d70f52e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -102,9 +102,7 @@ public:
bool runOnFunction(Function &F) override;
- const char *getPassName() const override {
- return "SI annotate control flow";
- }
+ StringRef getPassName() const override { return "SI annotate control flow"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
@@ -148,12 +146,15 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Break = M.getOrInsertFunction(
BreakIntrinsic, Int64, Int64, (Type *)nullptr);
+ cast<Function>(Break)->setDoesNotAccessMemory();
IfBreak = M.getOrInsertFunction(
IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
+ cast<Function>(IfBreak)->setDoesNotAccessMemory();
ElseBreak = M.getOrInsertFunction(
ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
+ cast<Function>(ElseBreak)->setDoesNotAccessMemory();
Loop = M.getOrInsertFunction(
LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
@@ -331,6 +332,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
+ if (!L)
+ return;
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
@@ -361,7 +364,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
std::vector<BasicBlock*> Preds;
for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
- if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end())
+ if (!is_contained(Latches, *PI))
Preds.push_back(*PI);
}
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
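
Note on the is_contained change above: the helper (provided by llvm/ADT/STLExtras.h) is just the find-against-end idiom behind a readable predicate. A standalone equivalent, written with a local helper so the sketch builds without the LLVM headers:

  #include <algorithm>
  #include <cstdio>
  #include <iterator>
  #include <vector>

  template <typename Range, typename T>
  static bool isContained(const Range &R, const T &V) {
    return std::find(std::begin(R), std::end(R), V) != std::end(R);
  }

  int main() {
    std::vector<int> Latches = {1, 2, 3};
    // Same result as: std::find(Latches.begin(), Latches.end(), 2) != Latches.end()
    std::printf("%d\n", isContained(Latches, 2));  // prints 1
  }
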
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
index 65ceff3..62ebef8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -38,7 +38,7 @@ public:
static char ID;
SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
- const char *getPassName() const override { return PASS_NAME; }
+ StringRef getPassName() const override { return PASS_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
index f4b04e3..ff4e321 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -13,76 +13,111 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+namespace llvm {
+
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
-enum {
- SALU = 1 << 3,
- VALU = 1 << 4,
-
- SOP1 = 1 << 5,
- SOP2 = 1 << 6,
- SOPC = 1 << 7,
- SOPK = 1 << 8,
- SOPP = 1 << 9,
-
- VOP1 = 1 << 10,
- VOP2 = 1 << 11,
- VOP3 = 1 << 12,
- VOPC = 1 << 13,
+enum : uint64_t {
+ // Low bits - basic encoding information.
+ SALU = 1 << 0,
+ VALU = 1 << 1,
+
+ // SALU instruction formats.
+ SOP1 = 1 << 2,
+ SOP2 = 1 << 3,
+ SOPC = 1 << 4,
+ SOPK = 1 << 5,
+ SOPP = 1 << 6,
+
+ // VALU instruction formats.
+ VOP1 = 1 << 7,
+ VOP2 = 1 << 8,
+ VOPC = 1 << 9,
+
+ // TODO: Should this be split into VOP3 a and b?
+ VOP3 = 1 << 10,
+
+ VINTRP = 1 << 13,
SDWA = 1 << 14,
DPP = 1 << 15,
+ // Memory instruction formats.
MUBUF = 1 << 16,
MTBUF = 1 << 17,
SMRD = 1 << 18,
- DS = 1 << 19,
- MIMG = 1 << 20,
+ MIMG = 1 << 19,
+ EXP = 1 << 20,
FLAT = 1 << 21,
- WQM = 1 << 22,
+ DS = 1 << 22,
+
+ // Pseudo instruction formats.
VGPRSpill = 1 << 23,
- VOPAsmPrefer32Bit = 1 << 24,
- Gather4 = 1 << 25,
- DisableWQM = 1 << 26
+ SGPRSpill = 1 << 24,
+
+ // High bits - other information.
+ VM_CNT = UINT64_C(1) << 32,
+ EXP_CNT = UINT64_C(1) << 33,
+ LGKM_CNT = UINT64_C(1) << 34,
+
+ WQM = UINT64_C(1) << 35,
+ DisableWQM = UINT64_C(1) << 36,
+ Gather4 = UINT64_C(1) << 37,
+ SOPK_ZEXT = UINT64_C(1) << 38,
+ SCALAR_STORE = UINT64_C(1) << 39,
+ FIXED_SIZE = UINT64_C(1) << 40,
+ VOPAsmPrefer32Bit = UINT64_C(1) << 41
+
+};
+
+// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+// The result is true if any of these tests are true.
+enum ClassFlags {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
};
}
-namespace llvm {
namespace AMDGPU {
enum OperandType {
- /// Operand with register or 32-bit immediate
- OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET,
- /// Operand with register or inline constant
- OPERAND_REG_INLINE_C,
-
- /// Operand with 32-bit immediate that uses the constant bus. The standard
- /// OPERAND_IMMEDIATE should be used for special immediates such as source
- /// modifiers.
- OPERAND_KIMM32
- };
-}
-}
-
-namespace SIInstrFlags {
- enum Flags {
- // First 4 bits are the instruction encoding
- VM_CNT = 1 << 0,
- EXP_CNT = 1 << 1,
- LGKM_CNT = 1 << 2
- };
-
- // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
- // The result is true if any of these tests are true.
- enum ClassFlags {
- S_NAN = 1 << 0, // Signaling NaN
- Q_NAN = 1 << 1, // Quiet NaN
- N_INFINITY = 1 << 2, // Negative infinity
- N_NORMAL = 1 << 3, // Negative normal
- N_SUBNORMAL = 1 << 4, // Negative subnormal
- N_ZERO = 1 << 5, // Negative zero
- P_ZERO = 1 << 6, // Positive zero
- P_SUBNORMAL = 1 << 7, // Positive subnormal
- P_NORMAL = 1 << 8, // Positive normal
- P_INFINITY = 1 << 9 // Positive infinity
+ /// Operands with register or 32-bit immediate
+ OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_REG_IMM_INT64,
+ OPERAND_REG_IMM_INT16,
+ OPERAND_REG_IMM_FP32,
+ OPERAND_REG_IMM_FP64,
+ OPERAND_REG_IMM_FP16,
+
+ /// Operands with register or inline constant
+ OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_INT32,
+ OPERAND_REG_INLINE_C_INT64,
+ OPERAND_REG_INLINE_C_FP16,
+ OPERAND_REG_INLINE_C_FP32,
+ OPERAND_REG_INLINE_C_FP64,
+
+ OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+
+ OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+
+ OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
+
+ // Operand for source modifiers for VOP instructions
+ OPERAND_INPUT_MODS,
+
+ /// Operand with 32-bit immediate that uses the constant bus.
+ OPERAND_KIMM32,
+ OPERAND_KIMM16
};
}
@@ -105,7 +140,24 @@ namespace SIOutMods {
};
}
-namespace llvm {
+namespace VGPRIndexMode {
+ enum {
+ SRC0_ENABLE = 1 << 0,
+ SRC1_ENABLE = 1 << 1,
+ SRC2_ENABLE = 1 << 2,
+ DST_ENABLE = 1 << 3
+ };
+}
+
+namespace AMDGPUAsmVariants {
+ enum {
+ DEFAULT = 0,
+ VOP3 = 1,
+ SDWA = 2,
+ DPP = 3
+ };
+}
+
namespace AMDGPU {
namespace EncValues { // Encoding values of enum9/8/7 operands
@@ -126,9 +178,7 @@ enum {
} // namespace EncValues
} // namespace AMDGPU
-} // namespace llvm
-namespace llvm {
namespace AMDGPU {
namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
@@ -184,6 +234,13 @@ namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
enum Id { // HwRegCode, (6) [5:0]
ID_UNKNOWN_ = -1,
ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
+ ID_MODE = 1,
+ ID_STATUS = 2,
+ ID_TRAPSTS = 3,
+ ID_HW_ID = 4,
+ ID_GPR_ALLOC = 5,
+ ID_LDS_ALLOC = 6,
+ ID_IB_STS = 7,
ID_SYMBOLIC_LAST_ = 8,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
@@ -205,8 +262,27 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
};
} // namespace Hwreg
+
+namespace SDWA {
+
+enum SdwaSel {
+ BYTE_0 = 0,
+ BYTE_1 = 1,
+ BYTE_2 = 2,
+ BYTE_3 = 3,
+ WORD_0 = 4,
+ WORD_1 = 5,
+ DWORD = 6,
+};
+
+enum DstUnused {
+ UNUSED_PAD = 0,
+ UNUSED_SEXT = 1,
+ UNUSED_PRESERVE = 2,
+};
+
+} // namespace SDWA
} // namespace AMDGPU
-} // namespace llvm
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
@@ -312,4 +388,6 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
+} // End namespace llvm
+
#endif
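
Note on the ClassFlags block moved above: v_cmp_class_* takes a 10-bit mask and the comparison is true if the operand falls into any class selected by the mask. A hedged sketch of that semantics for a few of the classes (quiet vs. signaling NaN and the normal/subnormal classes are not modeled here):

  #include <cmath>
  #include <cstdint>
  #include <cstdio>

  enum ClassFlagsSketch : uint32_t {
    Q_NAN      = 1 << 1,
    N_INFINITY = 1 << 2,
    N_ZERO     = 1 << 5,
    P_ZERO     = 1 << 6,
    P_INFINITY = 1 << 9
  };

  static bool cmpClass(float V, uint32_t Mask) {
    uint32_t Cls = 0;
    if (std::isnan(V))
      Cls = Q_NAN;
    else if (std::isinf(V))
      Cls = V < 0 ? N_INFINITY : P_INFINITY;
    else if (V == 0.0f)
      Cls = std::signbit(V) ? N_ZERO : P_ZERO;
    return (Cls & Mask) != 0;   // true if any selected class matches
  }

  int main() {
    std::printf("%d\n", cmpClass(-0.0f, N_ZERO | P_ZERO));  // prints 1
  }
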
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
index 636750d..d4d3959 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
@@ -37,9 +37,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Fix CF Live Intervals";
- }
+ StringRef getPassName() const override { return "SI Fix CF Live Intervals"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9e0086b..6a422e7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -82,6 +83,9 @@ using namespace llvm;
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
+
+ MachineDominatorTree *MDT;
+
public:
static char ID;
@@ -89,11 +93,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Fix SGPR copies";
- }
+ StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -101,8 +105,12 @@ public:
} // End anonymous namespace
-INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
- "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+ "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+ "SI Fix SGPR copies", false, false)
+
char SIFixSGPRCopies::ID = 0;
@@ -236,11 +244,94 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return true;
}
+static bool phiHasVGPROperands(const MachineInstr &PHI,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII) {
+
+ for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
+ unsigned Reg = PHI.getOperand(i).getReg();
+ if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
+ return true;
+ }
+ return false;
+}
+static bool phiHasBreakDef(const MachineInstr &PHI,
+ const MachineRegisterInfo &MRI,
+ SmallSet<unsigned, 8> &Visited) {
+
+ for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
+ unsigned Reg = PHI.getOperand(i).getReg();
+ if (Visited.count(Reg))
+ continue;
+
+ Visited.insert(Reg);
+
+ MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
+ assert(DefInstr);
+ switch (DefInstr->getOpcode()) {
+ default:
+ break;
+ case AMDGPU::SI_BREAK:
+ case AMDGPU::SI_IF_BREAK:
+ case AMDGPU::SI_ELSE_BREAK:
+ return true;
+ case AMDGPU::PHI:
+ if (phiHasBreakDef(*DefInstr, MRI, Visited))
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
+ const TargetRegisterInfo &TRI) {
+ for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
+ E = MBB.end(); I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
+ return true;
+ }
+ return false;
+}
+
+static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
+ const MachineInstr *MoveImm,
+ const SIInstrInfo *TII,
+ unsigned &SMovOp,
+ int64_t &Imm) {
+
+ if (!MoveImm->isMoveImmediate())
+ return false;
+
+ const MachineOperand *ImmOp =
+ TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
+ if (!ImmOp->isImm())
+ return false;
+
+ // FIXME: Handle copies with sub-regs.
+ if (Copy->getOperand(0).getSubReg())
+ return false;
+
+ switch (MoveImm->getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::V_MOV_B32_e32:
+ SMovOp = AMDGPU::S_MOV_B32;
+ break;
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ SMovOp = AMDGPU::S_MOV_B64;
+ break;
+ }
+ Imm = ImmOp->getImm();
+ return true;
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
SmallVector<MachineInstr *, 16> Worklist;
@@ -264,18 +355,40 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
+ MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+ unsigned SMovOp;
+ int64_t Imm;
+ // If we are just copying an immediate, we can replace the copy with
+ // s_mov_b32.
+ if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
+ MI.getOperand(1).ChangeToImmediate(Imm);
+ MI.addImplicitDefUseOperands(MF);
+ MI.setDesc(TII->get(SMovOp));
+ break;
+ }
TII->moveToVALU(MI);
}
break;
}
case AMDGPU::PHI: {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
unsigned Reg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
+ // We don't need to fix the PHI if the common dominator of the
+ // two incoming blocks terminates with a uniform branch.
+ if (MI.getNumExplicitOperands() == 5) {
+ MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+ MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+ MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+ if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+ DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+ break;
+ }
+ }
+
// If a PHI node defines an SGPR and any of its operands are VGPRs,
// then we need to move it to the VALU.
//
@@ -302,10 +415,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// ...
// use sgpr2
//
- // FIXME: This is OK if the branching decision is made based on an
- // SGPR value.
- bool SGPRBranch = false;
-
// The one exception to this rule is when one of the operands
// is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
// instruction. In this case, there we know the program will
@@ -313,31 +422,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// the first block (where the condition is computed), so there
// is no chance for values to be over-written.
- bool HasBreakDef = false;
- for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
- unsigned Reg = MI.getOperand(i).getReg();
- if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
- TII->moveToVALU(MI);
- break;
- }
- MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
- assert(DefInstr);
- switch(DefInstr->getOpcode()) {
-
- case AMDGPU::SI_BREAK:
- case AMDGPU::SI_IF_BREAK:
- case AMDGPU::SI_ELSE_BREAK:
- // If we see a PHI instruction that defines an SGPR, then that PHI
- // instruction has already been considered and should have
- // a *_BREAK as an operand.
- case AMDGPU::PHI:
- HasBreakDef = true;
- break;
- }
- }
-
- if (!SGPRBranch && !HasBreakDef)
+ SmallSet<unsigned, 8> Visited;
+ if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
+ !phiHasBreakDef(MI, MRI, Visited)) {
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
+ }
break;
}
case AMDGPU::REG_SEQUENCE: {
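
Note on phiHasBreakDef above: it is a depth-first walk over PHI inputs, recursing through other PHIs, with a Visited set so cyclic PHI graphs terminate. A self-contained model of that traversal; the Def records stand in for MachineRegisterInfo defs and all names are illustrative:

  #include <cstdio>
  #include <set>
  #include <vector>

  struct Def {
    bool IsBreak = false;          // stands in for the SI_*_BREAK opcodes
    bool IsPhi = false;
    std::vector<int> PhiInputs;    // register numbers feeding the PHI
  };

  static bool hasBreakDef(int Reg, const std::vector<Def> &Defs,
                          std::set<int> &Visited) {
    if (!Visited.insert(Reg).second)
      return false;                // already seen: avoid infinite recursion
    const Def &D = Defs[Reg];
    if (D.IsBreak)
      return true;
    if (D.IsPhi)
      for (int In : D.PhiInputs)
        if (hasBreakDef(In, Defs, Visited))
          return true;
    return false;
  }

  int main() {
    // Reg 0 is a PHI of {1, 0} (a loop); reg 1 is defined by a "break".
    std::vector<Def> Defs(2);
    Defs[0].IsPhi = true; Defs[0].PhiInputs = {1, 0};
    Defs[1].IsBreak = true;
    std::set<int> Visited;
    std::printf("%d\n", hasBreakDef(0, Defs, Visited));  // prints 1
  }
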
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4ecc0fc..a5c0d49 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -25,9 +25,55 @@ using namespace llvm;
namespace {
+struct FoldCandidate {
+ MachineInstr *UseMI;
+ union {
+ MachineOperand *OpToFold;
+ uint64_t ImmToFold;
+ int FrameIndexToFold;
+ };
+ unsigned char UseOpNo;
+ MachineOperand::MachineOperandType Kind;
+
+ FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
+ UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+ if (FoldOp->isImm()) {
+ ImmToFold = FoldOp->getImm();
+ } else if (FoldOp->isFI()) {
+ FrameIndexToFold = FoldOp->getIndex();
+ } else {
+ assert(FoldOp->isReg());
+ OpToFold = FoldOp;
+ }
+ }
+
+ bool isFI() const {
+ return Kind == MachineOperand::MO_FrameIndex;
+ }
+
+ bool isImm() const {
+ return Kind == MachineOperand::MO_Immediate;
+ }
+
+ bool isReg() const {
+ return Kind == MachineOperand::MO_Register;
+ }
+};
+
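
Note on the new FoldCandidate above: it keeps an explicit kind tag next to a union, where the old version overloaded OpToFold == nullptr to mean "immediate"; the tag is what allows frame indexes to be folded as well. A reduced sketch of that tagged-union shape with stand-in types, not the LLVM classes:

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  struct Operand;  // stand-in for MachineOperand

  enum class FoldKind { Register, Immediate, FrameIndex };

  struct FoldCandidateSketch {
    union {
      const Operand *OpToFold;
      int64_t ImmToFold;
      int FrameIndexToFold;
    };
    FoldKind Kind;

    explicit FoldCandidateSketch(int64_t Imm)
        : ImmToFold(Imm), Kind(FoldKind::Immediate) {}

    bool isImm() const { return Kind == FoldKind::Immediate; }
  };

  int main() {
    FoldCandidateSketch C(42);
    assert(C.isImm());
    std::printf("%lld\n", (long long)C.ImmToFold);  // prints 42
  }
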
class SIFoldOperands : public MachineFunctionPass {
public:
static char ID;
+ MachineRegisterInfo *MRI;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
+ void foldOperand(MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+
+ void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
public:
SIFoldOperands() : MachineFunctionPass(ID) {
@@ -36,9 +82,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Fold Operands";
- }
+ StringRef getPassName() const override { return "SI Fold Operands"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -46,29 +90,6 @@ public:
}
};
-struct FoldCandidate {
- MachineInstr *UseMI;
- unsigned UseOpNo;
- MachineOperand *OpToFold;
- uint64_t ImmToFold;
-
- FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
- UseMI(MI), UseOpNo(OpNo) {
-
- if (FoldOp->isImm()) {
- OpToFold = nullptr;
- ImmToFold = FoldOp->getImm();
- } else {
- assert(FoldOp->isReg());
- OpToFold = FoldOp;
- }
- }
-
- bool isImm() const {
- return !OpToFold;
- }
-};
-
} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
@@ -78,15 +99,50 @@ char SIFoldOperands::ID = 0;
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+// Wrapper around isInlineConstant that understands special cases when
+// instruction types are replaced during operand folding.
+static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ unsigned OpNo,
+ const MachineOperand &OpToFold) {
+ if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
+ return true;
+
+ unsigned Opc = UseMI.getOpcode();
+ switch (Opc) {
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64: {
+ // Special case for mac. Since this is replaced with mad when folded into
+ // src2, we need to check the legality for the final instruction.
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ const MCInstrDesc &MadDesc
+ = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-static bool isSafeToFold(unsigned Opcode) {
- switch(Opcode) {
+static bool isSafeToFold(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+ // If there are additional implicit register operands, this may be used for
+ // register indexing so the source register operand isn't simply copied.
+ unsigned NumOps = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses();
+
+ return MI.getNumOperands() == NumOps;
+ }
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
@@ -107,6 +163,11 @@ static bool updateOperand(FoldCandidate &Fold,
return true;
}
+ if (Fold.isFI()) {
+ Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ return true;
+ }
+
MachineOperand *New = Fold.OpToFold;
if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
TargetRegisterInfo::isVirtualRegister(New->getReg())) {
@@ -119,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}
-static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
const MachineInstr *MI) {
for (auto Candidate : FoldList) {
if (Candidate.UseMI == MI)
@@ -128,19 +189,21 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
return false;
}
-static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
- // Special case for v_mac_f32_e64 if we are trying to fold into src2
+ // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if (Opc == AMDGPU::V_MAC_F32_e64 &&
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- // Check if changing this to a v_mad_f32 instruction will allow us to
- // fold the operand.
- MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+
+ // Check if changing this to a v_mad_{f16, f32} instruction will allow us
+ // to fold the operand.
+ MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
@@ -149,6 +212,13 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MI->setDesc(TII->get(Opc));
}
+ // Special case for s_setreg_b32
+ if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+ MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+ }
+
// If we are already folding into another operand of MI, then
// we can't commute the instruction, otherwise we risk making the
// other fold illegal.
@@ -188,108 +258,432 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
return true;
}
-static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
- unsigned UseOpIdx,
- std::vector<FoldCandidate> &FoldList,
- SmallVectorImpl<MachineInstr *> &CopiesToReplace,
- const SIInstrInfo *TII, const SIRegisterInfo &TRI,
- MachineRegisterInfo &MRI) {
+// If the use operand doesn't care about the value, this may be an operand only
+// used for register indexing, in which case it is unsafe to fold.
+static bool isUseSafeToFold(const MachineInstr &MI,
+ const MachineOperand &UseMO) {
+ return !UseMO.isUndef();
+ //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
+}
+
+void SIFoldOperands::foldOperand(
+ MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ if (!isUseSafeToFold(*UseMI, UseOp))
+ return;
+
// FIXME: Fold operands with subregs.
- if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
- UseOp.isImplicit())) {
+ if (UseOp.isReg() && OpToFold.isReg()) {
+ if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
+ return;
+
+ // Don't fold subregister extracts into tied operands; only fold full
+ // copies, since a subregister use tied to a full register def doesn't
+ // really make sense. e.g. don't fold:
+ //
+ // %vreg1 = COPY %vreg0:sub1
+ // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
+ //
+ // into
+ // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
+ if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
+ return;
+ }
+
+ // Special case for REG_SEQUENCE: We can't fold literals into
+ // REG_SEQUENCE instructions, so we have to fold them into the
+ // uses of REG_SEQUENCE.
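+ // For example (sketch, register numbers arbitrary): given
+ //   %vreg1 = REG_SEQUENCE %vreg0, sub0, %vreg2, sub1
+ // an immediate defining %vreg0 is folded into the users of %vreg1:sub0
+ // below, rather than into the REG_SEQUENCE itself.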
+ if (UseMI->isRegSequence()) {
+ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+
+ for (MachineRegisterInfo::use_iterator
+ RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
+ RSUse != RSE; ++RSUse) {
+
+ MachineInstr *RSUseMI = RSUse->getParent();
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
+ continue;
+
+ foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ CopiesToReplace);
+ }
+
return;
}
+
bool FoldingImm = OpToFold.isImm();
- APInt Imm;
- if (FoldingImm) {
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (FoldingImm && UseMI->isCopy()) {
+ unsigned DestReg = UseMI->getOperand(0).getReg();
+ const TargetRegisterClass *DestRC
+ = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+ MRI->getRegClass(DestReg) :
+ TRI->getPhysRegClass(DestReg);
+
+ unsigned MovOp = TII->getMovOpcode(DestRC);
+ if (MovOp == AMDGPU::COPY)
+ return;
+
+ UseMI->setDesc(TII->get(MovOp));
+ CopiesToReplace.push_back(UseMI);
+ } else {
+ const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+ // Don't fold into target independent nodes. Target independent opcodes
+ // don't have defined register classes.
+ if (UseDesc.isVariadic() ||
+ UseDesc.OpInfo[UseOpIdx].RegClass == -1)
+ return;
+ }
+
+ if (!FoldingImm) {
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
+ return;
+ }
+
+
+ const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
+ const TargetRegisterClass *FoldRC =
+ TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
+
+ APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
+ OpToFold.getImm());
+
+ // Split 64-bit constants into 32-bits for folding.
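+ // e.g. when folding the 64-bit constant 0x300000002 through a sub0 use,
+ // only the low half (2) is folded; a sub1 use would get the high half (3).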
+ if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
unsigned UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC
= TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI.getRegClass(UseReg) :
- TRI.getPhysRegClass(UseReg);
-
- Imm = APInt(64, OpToFold.getImm());
+ MRI->getRegClass(UseReg) :
+ TRI->getPhysRegClass(UseReg);
- const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
- const TargetRegisterClass *FoldRC =
- TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+ assert(Imm.getBitWidth() == 64);
- // Split 64-bit constants into 32-bits for folding.
- if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
- if (UseRC->getSize() != 8)
- return;
+ if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
+ return;
- if (UseOp.getSubReg() == AMDGPU::sub0) {
- Imm = Imm.getLoBits(32);
- } else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
- Imm = Imm.getHiBits(32);
- }
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
}
+ }
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
- if (UseMI->getOpcode() == AMDGPU::COPY) {
- unsigned DestReg = UseMI->getOperand(0).getReg();
- const TargetRegisterClass *DestRC
- = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- TRI.getPhysRegClass(DestReg);
-
- unsigned MovOp = TII->getMovOpcode(DestRC);
- if (MovOp == AMDGPU::COPY)
- return;
-
- UseMI->setDesc(TII->get(MovOp));
- CopiesToReplace.push_back(UseMI);
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+}
+
+static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
+ uint32_t LHS, uint32_t RHS) {
+ switch (Opcode) {
+ case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::S_AND_B32:
+ Result = LHS & RHS;
+ return true;
+ case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::S_OR_B32:
+ Result = LHS | RHS;
+ return true;
+ case AMDGPU::V_XOR_B32_e64:
+ case AMDGPU::V_XOR_B32_e32:
+ case AMDGPU::S_XOR_B32:
+ Result = LHS ^ RHS;
+ return true;
+ case AMDGPU::V_LSHL_B32_e64:
+ case AMDGPU::V_LSHL_B32_e32:
+ case AMDGPU::S_LSHL_B32:
+ // The instruction ignores the high bits for out of bounds shifts.
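+ // e.g. an out-of-range shift amount of 35 behaves as 35 & 31 == 3.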
+ Result = LHS << (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHLREV_B32_e64:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ Result = RHS << (LHS & 31);
+ return true;
+ case AMDGPU::V_LSHR_B32_e64:
+ case AMDGPU::V_LSHR_B32_e32:
+ case AMDGPU::S_LSHR_B32:
+ Result = LHS >> (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_LSHRREV_B32_e32:
+ Result = RHS >> (LHS & 31);
+ return true;
+ case AMDGPU::V_ASHR_I32_e64:
+ case AMDGPU::V_ASHR_I32_e32:
+ case AMDGPU::S_ASHR_I32:
+ Result = static_cast<int32_t>(LHS) >> (RHS & 31);
+ return true;
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ Result = static_cast<int32_t>(RHS) >> (LHS & 31);
+ return true;
+ default:
+ return false;
+ }
+}
+
+static unsigned getMovOpc(bool IsScalar) {
+ return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+}
+
+/// Remove any leftover implicit operands from mutating the instruction. e.g.
+/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
+/// anymore.
+static void stripExtraCopyOperands(MachineInstr &MI) {
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned NumOps = Desc.getNumOperands() +
+ Desc.getNumImplicitUses() +
+ Desc.getNumImplicitDefs();
+
+ for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+ MI.RemoveOperand(I);
+}
+
+static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
+ MI.setDesc(NewDesc);
+ stripExtraCopyOperands(MI);
+}
+
+static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
+ MachineOperand &Op) {
+ if (Op.isReg()) {
+ // If this has a subregister, it obviously is a register source.
+ if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ return &Op;
+
+ MachineInstr *Def = MRI.getVRegDef(Op.getReg());
+ if (Def->isMoveImmediate()) {
+ MachineOperand &ImmSrc = Def->getOperand(1);
+ if (ImmSrc.isImm())
+ return &ImmSrc;
}
}
- // Special case for REG_SEQUENCE: We can't fold literals into
- // REG_SEQUENCE instructions, so we have to fold them into the
- // uses of REG_SEQUENCE.
- if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
- unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ return &Op;
+}
- for (MachineRegisterInfo::use_iterator
- RSUse = MRI.use_begin(RegSeqDstReg),
- RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {
+// Try to simplify operations with a constant that may appear after instruction
+// selection.
+// TODO: See if a frame index with a fixed offset can fold.
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII,
+ MachineInstr *MI,
+ MachineOperand *ImmOp) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+ Opc == AMDGPU::S_NOT_B32) {
+ MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
+ mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ return true;
+ }
- MachineInstr *RSUseMI = RSUse->getParent();
- if (RSUse->getSubReg() != RegSeqDstSubReg)
- continue;
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return false;
- foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
+ MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
+
+ if (!Src0->isImm() && !Src1->isImm())
+ return false;
+
+ // and k0, k1 -> v_mov_b32 (k0 & k1)
+ // or k0, k1 -> v_mov_b32 (k0 | k1)
+ // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
+ if (Src0->isImm() && Src1->isImm()) {
+ int32_t NewImm;
+ if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
+ return false;
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
+
+ // Be careful to change the correct operand; src0 may belong to a different
+ // instruction.
+ MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
+ MI->RemoveOperand(Src1Idx);
+ mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
+ return true;
+ }
+
+ if (!MI->isCommutable())
+ return false;
+
+ if (Src0->isImm() && !Src1->isImm()) {
+ std::swap(Src0, Src1);
+ std::swap(Src0Idx, Src1Idx);
+ }
+
+ int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
+ if (Opc == AMDGPU::V_OR_B32_e64 ||
+ Opc == AMDGPU::V_OR_B32_e32 ||
+ Opc == AMDGPU::S_OR_B32) {
+ if (Src1Val == 0) {
+ // y = or x, 0 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ } else if (Src1Val == -1) {
+ // y = or x, -1 => y = v_mov_b32 -1
+ MI->RemoveOperand(Src1Idx);
+ mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+ } else
+ return false;
+
+ return true;
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
+ MI->getOpcode() == AMDGPU::S_AND_B32) {
+ if (Src1Val == 0) {
+ // y = and x, 0 => y = v_mov_b32 0
+ MI->RemoveOperand(Src0Idx);
+ mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+ } else if (Src1Val == -1) {
+ // y = and x, -1 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ stripExtraCopyOperands(*MI);
+ } else
+ return false;
+
+ return true;
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
+ MI->getOpcode() == AMDGPU::S_XOR_B32) {
+ if (Src1Val == 0) {
+ // y = xor x, 0 => y = copy x
+ MI->RemoveOperand(Src1Idx);
+ mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ return true;
}
- return;
}
- const MCInstrDesc &UseDesc = UseMI->getDesc();
+ return false;
+}
- // Don't fold into target independent nodes. Target independent opcodes
- // don't have defined register classes.
- if (UseDesc.isVariadic() ||
- UseDesc.OpInfo[UseOpIdx].RegClass == -1)
- return;
+void SIFoldOperands::foldInstOperand(MachineInstr &MI,
+ MachineOperand &OpToFold) const {
+ // We need to mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+ SmallVector<FoldCandidate, 4> FoldList;
+ MachineOperand &Dst = MI.getOperand(0);
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
if (FoldingImm) {
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
- return;
- }
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
+
+ MachineRegisterInfo::use_iterator NextUse, NextInstUse;
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; Use = NextUse) {
+ NextUse = std::next(Use);
+ MachineInstr *UseMI = Use->getParent();
+ unsigned OpNo = Use.getOperandNo();
+
+ // Folding the immediate may reveal operations that can be constant
+ // folded or replaced with a copy. This can happen for example after
+ // frame indices are lowered to constants or from splitting 64-bit
+ // constants.
+ //
+ // We may also encounter cases where one or both operands are
+ // immediates materialized into a register, which would ordinarily not
+ // be folded due to multiple uses or operand constraints.
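+ // For instance (sketch), after a frame index is lowered to 0 we may see
+ //   %vreg0 = V_MOV_B32_e32 0
+ //   %vreg1 = V_OR_B32_e32 %vreg2, %vreg0
+ // which tryConstantFoldOp below rewrites into a plain COPY of %vreg2.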
+
+ if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
+ DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+
+ // Some constant folding cases change the same immediate's use to a new
+ // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
+ // again. The same constant folded instruction could also have a second
+ // use operand.
+ NextUse = MRI->use_begin(Dst.getReg());
+ continue;
+ }
+
+ // Try to fold any inline immediate uses, and then only fold other
+ // constants if they have one use.
+ //
+ // The legality of the inline immediate must be checked based on the use
+ // operand, not the defining instruction, because 32-bit instructions
+ // with 32-bit inline immediate sources may be used to materialize
+ // constants used in 16-bit operands.
+ //
+ // e.g. it is unsafe to fold:
+ // s_mov_b32 s0, 1.0 // materializes 0x3f800000
+ // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
+
+ // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better
+ // in some cases. A better heuristic is needed.
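+ // e.g. an s_mov_b32 of the inline constant 64 is folded into all of its
+ // uses, while a 32-bit literal such as 0x12345 is only folded when there
+ // is exactly one use that cannot take it as an inline constant.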
+ if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else {
+ if (++NumLiteralUses == 1) {
+ NonInlineUse = &*Use;
+ NonInlineUseOpNo = OpNo;
+ }
+ }
+ }
+
+ if (NumLiteralUses == 1) {
+ MachineInstr *UseMI = NonInlineUse->getParent();
+ foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
+ }
+ } else {
+ // Folding register.
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; ++Use) {
+ MachineInstr *UseMI = Use->getParent();
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(),
+ FoldList, CopiesToReplace);
+ }
+ }
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunites. The shrink operands pass
- // already does this.
- return;
+ MachineFunction *MF = MI.getParent()->getParent();
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(*MF);
+
+ for (FoldCandidate &Fold : FoldList) {
+ if (updateOperand(Fold, *TRI)) {
+ // Clear kill flags.
+ if (Fold.isReg()) {
+ assert(Fold.OpToFold && Fold.OpToFold->isReg());
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI->clearKillFlags(Fold.OpToFold->getReg());
+ }
+ DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+ static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ }
+ }
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
@@ -298,12 +692,12 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
@@ -311,25 +705,16 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (!isSafeToFold(MI.getOpcode()))
+ if (!isSafeToFold(MI))
continue;
- unsigned OpSize = TII->getOpSize(MI, 1);
MachineOperand &OpToFold = MI.getOperand(1);
- bool FoldingImm = OpToFold.isImm();
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
- // FIXME: We could also be folding things like FrameIndexes and
- // TargetIndexes.
+ // FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
continue;
- // Folding immediates with more than one use will increase program size.
- // FIXME: This will also reduce register usage, which may be better
- // in some cases. A better heuristic is needed.
- if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
- !MRI.hasOneUse(MI.getOperand(0).getReg()))
- continue;
-
if (OpToFold.isReg() &&
!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
continue;
@@ -345,40 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
- // We need mutate the operands of new mov instructions to add implicit
- // uses of EXEC, but adding them invalidates the use_iterator, so defer
- // this.
- SmallVector<MachineInstr *, 4> CopiesToReplace;
-
- std::vector<FoldCandidate> FoldList;
- for (MachineRegisterInfo::use_iterator
- Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
- Use != E; ++Use) {
-
- MachineInstr *UseMI = Use->getParent();
-
- foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
- }
-
- // Make sure we add EXEC uses to any new v_mov instructions created.
- for (MachineInstr *Copy : CopiesToReplace)
- Copy->addImplicitDefUseOperands(MF);
-
- for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, TRI)) {
- // Clear kill flags.
- if (!Fold.isImm()) {
- assert(Fold.OpToFold && Fold.OpToFold->isReg());
- // FIXME: Probably shouldn't bother trying to fold if not an
- // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
- // copies.
- MRI.clearKillFlags(Fold.OpToFold->getReg());
- }
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
- }
- }
+ foldInstOperand(MI, OpToFold);
}
}
return false;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 03b11f0..0b57155 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,20 +21,168 @@
using namespace llvm;
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
- const MachineFrameInfo *FrameInfo) {
- return FuncInfo->hasSpilledSGPRs() &&
- (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
-static ArrayRef<MCPhysReg> getAllSGPR128() {
+static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
+ const SIRegisterInfo *TRI) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
- AMDGPU::SGPR_128RegClass.getNumRegs());
+ TRI->getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs() {
+static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
+ const SIRegisterInfo *TRI) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
- AMDGPU::SGPR_32RegClass.getNumRegs());
+ TRI->getMaxNumSGPRs(MF));
+}
+
+void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
+ const SIRegisterInfo* TRI,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // We don't need this if we only have spills since there is no user facing
+ // scratch.
+
+ // TODO: If we know we don't have flat instructions earlier, we can omit
+ // this from the input registers.
+ //
+ // TODO: We only need to know if we access scratch space through a flat
+ // pointer. Because we only detect if flat instructions are used at all,
+ // this will be used more often than necessary on VI.
+
+ // Debug location must be unknown since the first debug location is used to
+ // determine the end of the prologue.
+ DebugLoc DL;
+ MachineBasicBlock::iterator I = MBB.begin();
+
+ unsigned FlatScratchInitReg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ // Copy the size in bytes.
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
+ unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+
+ // Add wave offset in bytes to private base offset.
+ // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+
+ // Convert offset to 256-byte units.
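+ // e.g. a byte offset of 0x1000 becomes 0x10 after the shift by 8.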
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitLo, RegState::Kill)
+ .addImm(8);
+}
+
+unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const {
+
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRsrcReg == AMDGPU::NoRegister)
+ return AMDGPU::NoRegister;
+
+ if (ST.hasSGPRInitBug() ||
+ ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
+ return ScratchRsrcReg;
+
+ // We reserved the last registers for this. Shift it down to the end of those
+ // which were actually used.
+ //
+ // FIXME: It might be safer to use a pseudoregister before replacement.
+
+ // FIXME: We should be able to eliminate unused input registers, except for
+ // the resources required for scratch access. For now we
+ // skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+ ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+ AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : AllSGPR128s) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+ //assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ MFI->setScratchRSrcReg(Reg);
+ return Reg;
+ }
+ }
+
+ return ScratchRsrcReg;
+}
+
+unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const {
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ if (ST.hasSGPRInitBug() ||
+ ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
+ return ScratchWaveOffsetReg;
+
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+
+ ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+ if (NumPreloaded > AllSGPRs.size())
+ return ScratchWaveOffsetReg;
+
+ AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
+ // We need to drop registers from the end of the list that we cannot use
+ // for the scratch wave offset.
+ // + 2 s102 and s103 do not exist on VI.
+ // + 2 for vcc
+ // + 2 for xnack_mask
+ // + 2 for flat_scratch
+ // + 4 for registers reserved for scratch resource register
+ // + 1 for register reserved for scratch wave offset. (By excluding this
+ // register from the list to consider, it means that when this
+ // register is being used for the scratch wave offset and there
+ // are no other free SGPRs, then the value will stay in this register.)
+ // ----
+ // 13
+ if (AllSGPRs.size() < 13)
+ return ScratchWaveOffsetReg;
+
+ for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
+ // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+ // scratch descriptor, since we haven't added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ if (!MRI.isAllocatable(Reg) ||
+ TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
+ continue;
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ MFI->setScratchWaveOffsetReg(Reg);
+ return Reg;
+ }
+ }
+
+ return ScratchWaveOffsetReg;
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
@@ -45,9 +193,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
- if (!MF.getFrameInfo()->hasStackObjects())
- return;
-
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -57,200 +202,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
//
// FIXME: We should be cleaning up these unused SGPR spill frame indices
// somewhere.
- if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
- return;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineBasicBlock::iterator I = MBB.begin();
-
- // We need to insert initialization of the scratch resource descriptor.
- unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- assert(ScratchRsrcReg != AMDGPU::NoRegister);
-
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
- assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
- unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ unsigned ScratchRsrcReg
+ = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+ unsigned ScratchWaveOffsetReg
+ = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
- unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdHsaOS()) {
- PreloadedPrivateBufferReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ if (ScratchRsrcReg == AMDGPU::NoRegister) {
+ assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+ return;
}
- if (MFI->hasFlatScratchInit()) {
- // We don't need this if we only have spills since there is no user facing
- // scratch.
-
- // TODO: If we know we don't have flat instructions earlier, we can omit
- // this from the input registers.
- //
- // TODO: We only need to know if we access scratch space through a flat
- // pointer. Because we only detect if flat instructions are used at all,
- // this will be used more often than necessary on VI.
-
- // Debug location must be unknown since the first debug location is used to
- // determine the end of the prologue.
- DebugLoc DL;
-
- unsigned FlatScratchInitReg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+ assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
- MRI.addLiveIn(FlatScratchInitReg);
- MBB.addLiveIn(FlatScratchInitReg);
+ // We need to do the replacement of the private segment buffer and wave offset
+ // register even if there are no stack objects. There could be stores to undef
+ // or a constant without an associated object.
- // Copy the size in bytes.
- unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
- .addReg(FlatScrInitHi, RegState::Kill);
+ // FIXME: We still have implicit uses on SGPR spill instructions in case they
+ // need to spill to vector memory. It's likely that will not happen, but at
+ // this point it appears we need the setup. This part of the prolog should be
+ // emitted after frame indices are eliminated.
- unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+ emitFlatScratchInit(TII, TRI, MF, MBB);
- // Add wave offset in bytes to private base offset.
- // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
- .addReg(FlatScrInitLo)
- .addReg(ScratchWaveOffsetReg);
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- // Convert offset to 256-byte units.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
- .addReg(FlatScrInitLo, RegState::Kill)
- .addImm(8);
- }
- // If we reserved the original input registers, we don't need to copy to the
- // reserved registers.
- if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
- // We should always reserve these 5 registers at the same time.
- assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
- "scratch wave offset and private segment buffer inconsistent");
- return;
+ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+ if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
+ PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
+ bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ if (OffsetRegUsed) {
+ assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ "scratch wave offset input is required");
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ }
- if (ST.isAmdHsaOS()) {
+ if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+ assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
- if (!ST.hasSGPRInitBug()) {
- // We reserved the last registers for this. Shift it down to the end of those
- // which were actually used.
- //
- // FIXME: It might be safer to use a pseudoregister before replacement.
-
- // FIXME: We should be able to eliminate unused input registers. We only
- // cannot do this for the resources required for scratch access. For now we
- // skip over user SGPRs and may leave unused holes.
-
- // We find the resource first because it has an alignment requirement.
- if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
- // Pick the first unallocated one. Make sure we don't clobber the other
- // reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg));
- MRI.replaceRegWith(ScratchRsrcReg, Reg);
- ScratchRsrcReg = Reg;
- MFI->setScratchRSrcReg(ScratchRsrcReg);
- break;
- }
- }
- }
+ // Make the register selected live throughout the function.
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB == &MBB)
+ continue;
- if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
- // We need to drop register from the end of the list that we cannot use
- // for the scratch wave offset.
- // + 2 s102 and s103 do not exist on VI.
- // + 2 for vcc
- // + 2 for xnack_mask
- // + 2 for flat_scratch
- // + 4 for registers reserved for scratch resource register
- // + 1 for register reserved for scratch wave offset. (By exluding this
- // register from the list to consider, it means that when this
- // register is being used for the scratch wave offset and there
- // are no other free SGPRs, then the value will stay in this register.
- // ----
- // 13
- for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
- // Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven’t added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) {
- if (!MRI.isAllocatable(Reg) ||
- TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
- continue;
-
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- ScratchWaveOffsetReg = Reg;
- MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
- break;
- }
- }
- }
+ if (OffsetRegUsed)
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+ if (ResourceRegUsed)
+ OtherBB.addLiveIn(ScratchRsrcReg);
}
+ DebugLoc DL;
+ MachineBasicBlock::iterator I = MBB.begin();
- assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
- const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- DebugLoc DL;
+ bool CopyBuffer = ResourceRegUsed &&
+ PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+ ST.isAmdCodeObjectV2(MF) &&
+ ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+ // This needs to be careful of the copying order to avoid overwriting one of
+ // the input registers before it's been copied to its final
+ // destination. Usually the offset should be copied first.
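+ // e.g. (hypothetical registers) if the preloaded buffer is s[0:3] and the
+ // chosen wave offset register is s1, emitting the offset copy first would
+ // clobber part of the buffer source, so the buffer must be copied first.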
+ bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+ ScratchWaveOffsetReg);
+ if (CopyBuffer && CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
- if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
- // Make sure we emit the copy for the offset first. We may have chosen to copy
- // the buffer resource into a register that aliases the input offset register.
- BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+ if (OffsetRegUsed &&
+ PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
}
- if (ST.isAmdHsaOS()) {
- // Insert copies from argument register.
- assert(
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
- unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
- unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
-
- unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
- unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
+ if (CopyBuffer && !CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
- const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+ if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+ assert(!ST.isAmdCodeObjectV2(MF));
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- BuildMI(MBB, I, DL, SMovB64, Rsrc01)
- .addReg(Lo, RegState::Kill);
- BuildMI(MBB, I, DL, SMovB64, Rsrc23)
- .addReg(Hi, RegState::Kill);
- } else {
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- BuildMI(MBB, I, DL, SMovB32, Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- BuildMI(MBB, I, DL, SMovB32, Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ if (MFI->hasPrivateMemoryInputPtr()) {
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+ const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
+
+ BuildMI(MBB, I, DL, Mov64, Rsrc01)
+ .addReg(PreloadedPrivateBufferReg)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+
+ PointerType *PtrTy =
+ PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ auto MMO = MF.getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 0, 0);
+ BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
+ .addReg(PreloadedPrivateBufferReg)
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addMemOperand(MMO)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+ } else {
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ }
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
.addImm(Rsrc23 & 0xffffffff)
@@ -260,15 +364,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}
-
- // Make the register selected live throughout the function.
- for (MachineBasicBlock &OtherBB : MF) {
- if (&OtherBB == &MBB)
- continue;
-
- OtherBB.addLiveIn(ScratchRsrcReg);
- OtherBB.addLiveIn(ScratchWaveOffsetReg);
- }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -279,20 +374,20 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI->hasStackObjects())
+ if (!MFI.hasStackObjects())
return;
- bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
+ bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
assert((RS || !MayNeedScavengingEmergencySlot) &&
"RegScavenger required if spilling");
if (MayNeedScavengingEmergencySlot) {
- int ScavengeFI = MFI->CreateSpillStackObject(
+ int ScavengeFI = MFI.CreateStackObject(
AMDGPU::SGPR_32RegClass.getSize(),
- AMDGPU::SGPR_32RegClass.getAlignment());
+ AMDGPU::SGPR_32RegClass.getAlignment(), false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 37417d0..7657b4e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -14,12 +14,17 @@
namespace llvm {
+class SIInstrInfo;
+class SIMachineFunctionInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
unsigned TransAl = 1) :
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
- ~SIFrameLowering() override {}
+ ~SIFrameLowering() override = default;
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
@@ -31,10 +36,29 @@ public:
RegScavenger *RS = nullptr) const override;
private:
+ void emitFlatScratchInit(const SIInstrInfo *TII,
+ const SIRegisterInfo* TRI,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
+
+ unsigned getReservedPrivateSegmentBufferReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const;
+
+ unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+ const SISubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI,
+ MachineFunction &MF) const;
+
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80d4435..b98f9f4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,7 @@
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -31,17 +32,18 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;
-// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
-static cl::opt<bool> EnableAMDGPUFastFDIV(
- "amdgpu-fast-fdiv",
- cl::desc("Enable faster 2.5 ulp fdiv"),
+static cl::opt<bool> EnableVGPRIndexMode(
+ "amdgpu-vgpr-index-mode",
+ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -58,7 +60,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -77,6 +79,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ if (Subtarget->has16BitInsts()) {
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
computeRegisterProperties(STI.getRegisterInfo());
// We need to custom lower vector stores from local memory
@@ -92,9 +99,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
+
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -111,6 +129,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
@@ -159,6 +178,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
+ // is expanded to avoid having two separate loops in case the index is a VGPR.
+
// Most operations are naturally 32-bit vector operations. We only support
// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
@@ -218,6 +240,83 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::Constant, MVT::i16, Legal);
+
+ setOperationAction(ISD::SMIN, MVT::i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::i16, Legal);
+
+ setOperationAction(ISD::UMIN, MVT::i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::i16, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
+ AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
+
+ setOperationAction(ISD::ROTR, MVT::i16, Promote);
+ setOperationAction(ISD::ROTL, MVT::i16, Promote);
+
+ setOperationAction(ISD::SDIV, MVT::i16, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, Promote);
+ setOperationAction(ISD::SREM, MVT::i16, Promote);
+ setOperationAction(ISD::UREM, MVT::i16, Promote);
+
+ setOperationAction(ISD::BSWAP, MVT::i16, Promote);
+ setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
+
+ setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+
+ setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+
+ setOperationAction(ISD::LOAD, MVT::i16, Custom);
+
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+
+ setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
+ AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
+ setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
+ AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+
+ // F16 - Constant Actions.
+ setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+
+ // F16 - Load/Store Actions.
+ setOperationAction(ISD::LOAD, MVT::f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
+ setOperationAction(ISD::STORE, MVT::f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
+
+ // F16 - VOP1 Actions.
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+
+ // F16 - VOP2 Actions.
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FDIV, MVT::f16, Custom);
+
+ // F16 - VOP3 Actions.
+ setOperationAction(ISD::FMA, MVT::f16, Legal);
+ if (!Subtarget->hasFP16Denormals())
+ setOperationAction(ISD::FMAD, MVT::f16, Legal);
+ }
+
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -229,6 +328,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
@@ -357,6 +458,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
case AMDGPUAS::CONSTANT_ADDRESS: {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
+ // FIXME: Can we get the real alignment here?
if (AM.BaseOffs % 4 != 0)
return isLegalMUBUFAddressingMode(AM);
@@ -435,8 +537,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
// which isn't a simple VT.
- if (!VT.isSimple() || VT == MVT::Other)
+ // Until MVT is extended to handle this, simply check for the size and
+ // rely on the condition below: allow accesses if the size is a multiple of 4.
+ if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+ VT.getStoreSize() > 16)) {
return false;
+ }
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
@@ -450,6 +556,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return AlignedBy4;
}
+ // FIXME: We have to be conservative here and assume that flat operations
+ // will access scratch. If we had access to the IR function, then we
+ // could determine if any private memory was used in the function.
+ if (!Subtarget->hasUnalignedScratchAccess() &&
+ (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+ return false;
+ }
+
if (Subtarget->hasUnalignedBufferAccess()) {
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
@@ -496,8 +611,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -505,6 +620,23 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.noclobber");
+}
+
+bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ // Flat -> private/local is a simple truncate.
+ // Flat -> global is no-op
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ return true;
+
+ return isNoopAddrSpaceCast(SrcAS, DestAS);
+}
+
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
@@ -531,11 +663,27 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- return TII->isInlineConstant(Imm);
+ // FIXME: Could be smarter if called for vector constants.
+ return true;
}
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+ if (Subtarget->has16BitInsts() && VT == MVT::i16) {
+ switch (Op) {
+ case ISD::LOAD:
+ case ISD::STORE:
+
+ // These operations are done with 32-bit instructions anyway.
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SELECT:
+ // TODO: Extensions?
+ return true;
+ default:
+ return false;
+ }
+ }
// SimplifySetCC uses this function to determine whether or not it should
// create setcc with i1 operands. We don't have instructions for i1 setcc.
@@ -560,26 +708,39 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
+
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+ unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg) const {
const DataLayout &DL = DAG.getDataLayout();
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
- MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
- SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned Align = DL.getABITypeAlignment(Ty);
- ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
+ SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+ MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue Val = Load;
+ if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
+ VT.bitsLT(MemVT)) {
+ unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
+ Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
+ }
+
if (MemVT.isFloatingPoint())
- ExtTy = ISD::EXTLOAD;
+ Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
+ else if (Signed)
+ Val = DAG.getSExtOrTrunc(Val, SL, VT);
+ else
+ Val = DAG.getZExtOrTrunc(Val, SL, VT);
- SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
- return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
- PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal |
- MachineMemOperand::MOInvariant);
+ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -679,12 +840,9 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (!AMDGPU::isShader(CallConv)) {
- getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
- Splits);
-
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ assert(!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
@@ -692,6 +850,12 @@ SDValue SITargetLowering::LowerFormalArguments(
!Info->hasWorkItemIDZ());
}
+ if (Info->hasPrivateMemoryInputPtr()) {
+ unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
+ MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(PrivateMemoryPtrReg);
+ }
+
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info->hasPrivateSegmentBuffer()) {
unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
@@ -701,29 +865,38 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Info->hasDispatchPtr()) {
unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
- MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info->hasQueuePtr()) {
unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
- MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
+ if (Info->hasDispatchID()) {
+ unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(DispatchIDReg);
+ }
+
if (Info->hasFlatScratchInit()) {
unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
- MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
- AnalyzeFormalArguments(CCInfo, Splits);
+ if (!AMDGPU::isShader(CallConv))
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+ else
+ AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -740,13 +913,14 @@ SDValue SITargetLowering::LowerFormalArguments(
if (VA.isMemLoc()) {
VT = Ins[i].VT;
- EVT MemVT = Splits[i].VT;
- const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+ EVT MemVT = VA.getLocVT();
+ const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contain information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
- Offset, Ins[i].Flags.isSExt());
+ Offset, Ins[i].Flags.isSExt(),
+ &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@@ -761,7 +935,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
InVals.push_back(Arg);
- Info->ABIArgOffset = Offset + MemVT.getStoreSize();
+ Info->setABIArgOffset(Offset + MemVT.getStoreSize());
continue;
}
assert(VA.isRegLoc() && "Parameter must be in a register!");
@@ -771,8 +945,8 @@ SDValue SITargetLowering::LowerFormalArguments(
if (VT == MVT::i64) {
// For now assume it is a pointer
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
- &AMDGPU::SReg_64RegClass);
- Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+ &AMDGPU::SGPR_64RegClass);
+ Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Copy);
continue;
@@ -816,25 +990,25 @@ SDValue SITargetLowering::LowerFormalArguments(
// Start adding system SGPRs.
if (Info->hasWorkGroupIDX()) {
unsigned Reg = Info->addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDZ()) {
unsigned Reg = Info->addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupInfo()) {
unsigned Reg = Info->addWorkGroupInfo();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -854,18 +1028,22 @@ SDValue SITargetLowering::LowerFormalArguments(
// Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
- bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+ bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
if (HasStackObjects)
Info->setHasNonSpillStackObjects(true);
- if (ST.isAmdHsaOS()) {
- // TODO: Assume we will spill without optimizations.
+ // Everything live out of a block is spilled with fast regalloc, so it's
+ // almost certain that spilling will be required.
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ HasStackObjects = true;
+
+ if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
- // resource. For the HSA ABI, this will be the first 4 user SGPR
- // inputs. We can reserve those and use them directly.
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
@@ -1088,64 +1266,551 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
MachineBasicBlock *SplitBB
= MF->CreateMachineBasicBlock(BB->getBasicBlock());
- // Fix the block phi references to point to the new block for the defs in the
- // second piece of the block.
- for (MachineBasicBlock *Succ : BB->successors()) {
- for (MachineInstr &MI : *Succ) {
- if (!MI.isPHI())
- break;
-
- for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) {
- MachineOperand &FromBB = MI.getOperand(I);
- if (BB == FromBB.getMBB()) {
- FromBB.setMBB(SplitBB);
- break;
- }
- }
- }
- }
-
MF->insert(++MachineFunction::iterator(BB), SplitBB);
SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
- SplitBB->transferSuccessors(BB);
+ SplitBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(SplitBB);
MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
return SplitBB;
}
+// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
+// wavefront. If the value is uniform and just happens to be in a VGPR, this
+// will only do one iteration. In the worst case, this will loop 64 times.
+//
+// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
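+//
+// A rough sketch of the loop that gets emitted (register names are
+// placeholders; the GPR-index-mode variant uses s_set_gpr_idx_idx instead of
+// writing m0):
+//
+//   loop:
+//     s_cur  = v_readfirstlane_b32 v_idx    ; pick the index from one lane
+//     vcc    = v_cmp_eq_u32 s_cur, v_idx    ; mask of lanes with that index
+//     m0     = s_cur (+ constant offset)
+//     s_save = exec; exec &= vcc            ; s_and_saveexec_b64
+//     <indexed access, e.g. v_movrels_b32, inserted by the caller>
+//     exec   = exec ^ s_save                ; remove the lanes just handled
+//     s_cbranch_execnz loop
+//
+// The original exec mask is restored after the loop by loadM0FromVGPR.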
+static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB,
+ MachineBasicBlock &LoopBB,
+ const DebugLoc &DL,
+ const MachineOperand &IdxReg,
+ unsigned InitReg,
+ unsigned ResultReg,
+ unsigned PhiReg,
+ unsigned InitSaveExecReg,
+ int Offset,
+ bool UseGPRIdxMode) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(InitReg)
+ .addMBB(&OrigBB)
+ .addReg(ResultReg)
+ .addMBB(&LoopBB);
+
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
+ .addReg(InitSaveExecReg)
+ .addMBB(&OrigBB)
+ .addReg(NewExec)
+ .addMBB(&LoopBB);
+
+ // Read the next variant <- also loop target.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+  // Compare the just-read value against the index in every lane.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
+ .addReg(CurrentIdxReg)
+ .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+
+ if (UseGPRIdxMode) {
+ unsigned IdxReg;
+ if (Offset == 0) {
+ IdxReg = CurrentIdxReg;
+ } else {
+ IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
+
+ MachineInstr *SetIdx =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
+ .addReg(IdxReg, RegState::Kill);
+ SetIdx->getOperand(2).setIsUndef();
+ } else {
+ // Move index from VCC into M0
+ if (Offset == 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill);
+ } else {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
+ }
+
+ // Update EXEC, save the original EXEC value to VCC.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ MachineInstr *InsertPt =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(NewExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+
+ return InsertPt->getIterator();
+}
+
+// This has slightly sub-optimal register allocation when the source vector is
+// killed by the read. The register allocator does not understand that the kill
+// is per-workitem, so the vector is kept alive for the whole loop and we end up
+// not re-using a subregister from it, using one more VGPR than necessary. The
+// extra register was avoided when this was expanded after register allocation.
+static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ unsigned InitResultReg,
+ unsigned PhiReg,
+ int Offset,
+ bool UseGPRIdxMode) {
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
+
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
+ .addReg(AMDGPU::EXEC);
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+
+ auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
+ InitResultReg, DstReg, PhiReg, TmpExec,
+ Offset, UseGPRIdxMode);
+
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(SaveExec);
+
+ return InsPt;
+}
+
+// Returns the subregister index and the remaining (dynamic) offset.
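+// For example, with a 128-bit register class (four 32-bit elements), an
+// illustrative mapping would be:
+//   Offset 2 -> (sub2, 0)   constant part folded into the subregister index
+//   Offset 7 -> (sub0, 7)   out of bounds, kept as a dynamic offset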
+static std::pair<unsigned, int>
+computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
+ const TargetRegisterClass *SuperRC,
+ unsigned VecReg,
+ int Offset) {
+ int NumElts = SuperRC->getSize() / 4;
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (Offset >= NumElts || Offset < 0)
+ return std::make_pair(AMDGPU::sub0, Offset);
+
+ return std::make_pair(AMDGPU::sub0 + Offset, 0);
+}
+
+// Return true if the index is an SGPR and was set.
+static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ int Offset,
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+
+ assert(Idx->getReg() != AMDGPU::NoRegister);
+
+ if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
+ return false;
+
+ if (UseGPRIdxMode) {
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ if (Offset == 0) {
+ MachineInstr *SetOn =
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addOperand(*Idx)
+ .addImm(IdxMode);
+
+ SetOn->getOperand(3).setIsUndef();
+ } else {
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
+ .addOperand(*Idx)
+ .addImm(Offset);
+ MachineInstr *SetOn =
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(Tmp, RegState::Kill)
+ .addImm(IdxMode);
+
+ SetOn->getOperand(3).setIsUndef();
+ }
+
+ return true;
+ }
+
+ if (Offset == 0) {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addOperand(*Idx);
+ } else {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addOperand(*Idx)
+ .addImm(Offset);
+ }
+
+ return true;
+}
+
+// Control flow needs to be inserted if indexing with a VGPR.
+static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+
+ const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
+
+ unsigned SubReg;
+ std::tie(SubReg, Offset)
+ = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
+
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ // TODO: Look at the uses to avoid the copy. This may require rescheduling
+ // to avoid interfering with other uses, so probably requires a new
+ // optimization pass.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+
+ return &MBB;
+ }
+
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
+
+ unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+
+ if (UseGPRIdxMode) {
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::SRC0_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ // Disable again after the loop.
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
+
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+
+ return LoopBB;
+}
+
+static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
+ switch (VecRC->getSize()) {
+ case 4:
+ return AMDGPU::V_MOVRELD_B32_V1;
+ case 8:
+ return AMDGPU::V_MOVRELD_B32_V2;
+ case 16:
+ return AMDGPU::V_MOVRELD_B32_V4;
+ case 32:
+ return AMDGPU::V_MOVRELD_B32_V8;
+ case 64:
+ return AMDGPU::V_MOVRELD_B32_V16;
+ default:
+ llvm_unreachable("unsupported size for MOVRELD pseudos");
+ }
+}
+
+static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+ const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
+
+ // This can be an immediate, but will be folded later.
+ assert(Val->getReg());
+
+ unsigned SubReg;
+ std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
+ SrcVec->getReg(),
+ Offset);
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ assert(Offset == 0);
+
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
+ .addOperand(*SrcVec)
+ .addOperand(*Val)
+ .addImm(SubReg);
+
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .addOperand(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+ BuildMI(MBB, I, DL, MovRelDesc)
+ .addReg(Dst, RegState::Define)
+ .addReg(SrcVec->getReg())
+ .addOperand(*Val)
+ .addImm(SubReg - AMDGPU::sub0);
+ }
+
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ if (Val->isReg())
+ MRI.clearKillFlags(Val->getReg());
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (UseGPRIdxMode) {
+ MachineBasicBlock::iterator I(&MI);
+
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::DST_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ // Disable again after the loop.
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
+
+ unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
+ Offset, UseGPRIdxMode);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+ .addOperand(*Val) // src0
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(PhiReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
+ .addReg(Dst, RegState::Define)
+ .addReg(PhiReg)
+ .addOperand(*Val)
+ .addImm(SubReg - AMDGPU::sub0);
+ }
+
+ MI.eraseFromParent();
+
+ return LoopBB;
+}
+
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ if (TII->isMIMG(MI)) {
+ if (!MI.memoperands_empty())
+ return BB;
+ // Add a memoperand for mimg instructions so that they aren't assumed to
+    // be ordered memory instructions.
+
+ MachinePointerInfo PtrInfo(MFI->getImagePSV());
+ MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
+ if (MI.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+
+ if (MI.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+
+ auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+ MI.addMemOperand(*MF, MMO);
+ return BB;
+ }
+
switch (MI.getOpcode()) {
case AMDGPU::SI_INIT_M0: {
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addOperand(MI.getOperand(0));
+ .addOperand(MI.getOperand(0));
MI.eraseFromParent();
- break;
- }
- case AMDGPU::BRANCH:
return BB;
+ }
case AMDGPU::GET_GROUPSTATICSIZE: {
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
- MachineFunction *MF = BB->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.addOperand(MI.getOperand(0))
- .addImm(MFI->LDSSize);
+ .addImm(MFI->getLDSSize());
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::SI_INDIRECT_SRC_V1:
+ case AMDGPU::SI_INDIRECT_SRC_V2:
+ case AMDGPU::SI_INDIRECT_SRC_V4:
+ case AMDGPU::SI_INDIRECT_SRC_V8:
+ case AMDGPU::SI_INDIRECT_SRC_V16:
+ return emitIndirectSrc(MI, *BB, *getSubtarget());
+ case AMDGPU::SI_INDIRECT_DST_V1:
+ case AMDGPU::SI_INDIRECT_DST_V2:
+ case AMDGPU::SI_INDIRECT_DST_V4:
+ case AMDGPU::SI_INDIRECT_DST_V8:
+ case AMDGPU::SI_INDIRECT_DST_V16:
+ return emitIndirectDst(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL:
return splitKillBlock(MI, BB);
+ case AMDGPU::V_CNDMASK_B64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src0 = MI.getOperand(1).getReg();
+ unsigned Src1 = MI.getOperand(2).getReg();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned SrcCond = MI.getOperand(3).getReg();
+
+ unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+ .addReg(Src0, 0, AMDGPU::sub0)
+ .addReg(Src1, 0, AMDGPU::sub0)
+ .addReg(SrcCond);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+ .addReg(Src0, 0, AMDGPU::sub1)
+ .addReg(Src1, 0, AMDGPU::sub1)
+ .addReg(SrcCond);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_BR_UNDEF: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addOperand(MI.getOperand(0));
+ Br->getOperand(1).setIsUndef(true); // read undef SCC
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
- return BB;
}
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
@@ -1167,8 +1832,10 @@ EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}
-MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
- return MVT::i32;
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
+ // TODO: Should i16 be used always if legal? For now it would force VALU
+ // shifts.
+ return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}
// Answering this is somewhat tricky and depends on the specific device which
@@ -1201,6 +1868,8 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
case MVT::f64:
return true;
+ case MVT::f16:
+ return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
default:
break;
}
@@ -1215,7 +1884,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
@@ -1242,6 +1910,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
case ISD::TRAP: return lowerTRAP(Op, DAG);
+ case ISD::FP_ROUND:
+ return lowerFP_ROUND(Op, DAG);
}
return SDValue();
}
@@ -1262,58 +1932,31 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
return nullptr;
}
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
- SDLoc SL(Op);
- FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
- unsigned FrameIndex = FINode->getIndex();
-
- // A FrameIndex node represents a 32-bit offset into scratch memory. If the
- // high bit of a frame index offset were to be set, this would mean that it
- // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
- // buffer, with 64 being the number of threads per wave.
- //
- // The maximum private allocation for the entire GPU is 4G, and we are
- // concerned with the largest the index could ever be for an individual
- // workitem. This will occur with the minmum dispatch size. If a program
- // requires more, the dispatch size will be reduced.
- //
- // With this limit, we can mark the high bit of the FrameIndex node as known
- // zero, which is important, because it means in most situations we can prove
- // that values derived from FrameIndex nodes are non-negative. This enables us
- // to take advantage of more addressing modes when accessing scratch buffers,
- // since for scratch reads/writes, the register offset must always be
- // positive.
-
- uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
- // XXX - It is unclear if partial dispatch works. Assume it works at half wave
- // granularity. It is probably a full wave.
- uint64_t MinGranularity = 32;
-
- unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
- EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(ExtVT));
-}
-
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
- if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
- return false;
+ if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ case AMDGPUIntrinsic::amdgcn_if:
+ case AMDGPUIntrinsic::amdgcn_else:
+ case AMDGPUIntrinsic::amdgcn_end_cf:
+ case AMDGPUIntrinsic::amdgcn_loop:
+ return true;
+ default:
+ return false;
+ }
+ }
- switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
- default: return false;
- case AMDGPUIntrinsic::amdgcn_if:
- case AMDGPUIntrinsic::amdgcn_else:
- case AMDGPUIntrinsic::amdgcn_break:
- case AMDGPUIntrinsic::amdgcn_if_break:
- case AMDGPUIntrinsic::amdgcn_else_break:
- case AMDGPUIntrinsic::amdgcn_loop:
- case AMDGPUIntrinsic::amdgcn_end_cf:
- return true;
+ if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
+ case AMDGPUIntrinsic::amdgcn_break:
+ case AMDGPUIntrinsic::amdgcn_if_break:
+ case AMDGPUIntrinsic::amdgcn_else_break:
+ return true;
+ default:
+ return false;
+ }
}
+
+ return false;
}
void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -1334,14 +1977,31 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
// For each dimension:
for (unsigned i = 0; i < 3; ++i) {
// Create fixed stack object for work group ID.
- ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+ ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
// Create fixed stack object for work item ID.
- ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+ ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
}
}
+bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
+ const Triple &TT = getTargetMachine().getTargetTriple();
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ AMDGPU::shouldEmitConstantsToTextSection(TT);
+}
+
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+ return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ !shouldEmitFixup(GV) &&
+ !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
+ return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
+}
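+
+// Taken together, these three predicates partition global references:
+// constants emitted into the text section get an absolute fixup, globals that
+// may be preempted are accessed through the GOT, and everything else uses a
+// direct pc-relative relocation.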
+
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, and switches the branch target with BR if the need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -1365,30 +2025,50 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
Target = BR->getOperand(1);
}
+ // FIXME: This changes the types of the intrinsics instead of introducing new
+ // nodes with the correct types.
+ // e.g. llvm.amdgcn.loop
+
+ // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
+ // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
+
if (!isCFIntrinsic(Intr)) {
// This is a uniform branch so we don't need to legalize.
return BRCOND;
}
+ bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
+ Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+
assert(!SetCC ||
(SetCC->getConstantOperandVal(1) == 1 &&
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
ISD::SETNE));
- // Build the result and
- ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
-
// operands of the new intrinsic call
SmallVector<SDValue, 4> Ops;
- Ops.push_back(BRCOND.getOperand(0));
- Ops.append(Intr->op_begin() + 1, Intr->op_end());
+ if (HaveChain)
+ Ops.push_back(BRCOND.getOperand(0));
+
+ Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
Ops.push_back(Target);
+ ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
+
// build the new intrinsic call
SDNode *Result = DAG.getNode(
Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
DAG.getVTList(Res), Ops).getNode();
+ if (!HaveChain) {
+ SDValue Ops[] = {
+ SDValue(Result, 0),
+ BRCOND.getOperand(0)
+ };
+
+ Result = DAG.getMergeValues(Ops, DL).getNode();
+ }
+
if (BR) {
// Give the branch instruction our target
SDValue Ops[] = {
@@ -1425,6 +2105,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL,
+ EVT VT) const {
+ return Op.getValueType().bitsLE(VT) ?
+ DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
+ DAG.getNode(ISD::FTRUNC, DL, VT, Op);
+}
+
+SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::f16 &&
+ "Do not know how to custom lower FP_ROUND for non-f16 type");
+
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT != MVT::f64)
+ return Op;
+
+ SDLoc DL(Op);
+
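+  // There is no single-step f64 -> f16 conversion here; FP_TO_FP16 yields the
+  // f16 bit pattern in the low 16 bits of an i32, which is then truncated and
+  // bitcast to f16.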
+ SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
+}
+
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
SelectionDAG &DAG) const {
SDLoc SL;
@@ -1452,7 +2157,8 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS,
MachinePointerInfo PtrInfo(V, StructOffset);
return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
MinAlign(64, StructOffset),
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
@@ -1505,17 +2211,12 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getUNDEF(ASC->getValueType(0));
}
-static bool shouldEmitGOTReloc(const GlobalValue *GV,
- const TargetMachine &TM) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
-}
-
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
- return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
+ return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ !shouldEmitGOTReloc(GA->getGlobal());
}
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
@@ -1523,14 +2224,27 @@ static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
unsigned GAFlags = SIInstrInfo::MO_NONE) {
// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
// lowered to the following code sequence:
- // s_getpc_b64 s[0:1]
- // s_add_u32 s0, s0, $symbol
- // s_addc_u32 s1, s1, 0
//
- // s_getpc_b64 returns the address of the s_add_u32 instruction and then
- // a fixup or relocation is emitted to replace $symbol with a literal
- // constant, which is a pc-relative offset from the encoding of the $symbol
- // operand to the global variable.
+ // For constant address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // For global address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+ // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // fixups or relocations are emitted to replace $symbol@*@lo and
+ // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+ // which is a 64-bit pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
//
// What we want here is an offset from the value returned by s_getpc
// (which is the address of the s_add_u32 instruction) to the global
@@ -1538,9 +2252,12 @@ static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address.
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
- GAFlags);
- return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
+ SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags);
+ SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags == SIInstrInfo::MO_NONE ?
+ GAFlags : GAFlags + 1);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
@@ -1556,11 +2273,14 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
- if (!shouldEmitGOTReloc(GV, getTargetMachine()))
+ if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+ else if (shouldEmitPCReloc(GV))
+ return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
+ SIInstrInfo::MO_REL32);
SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
- SIInstrInfo::MO_GOTPCREL);
+ SIInstrInfo::MO_GOTPCREL32);
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
@@ -1570,7 +2290,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerTRAP(SDValue Op,
@@ -1647,9 +2368,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// TODO: Should this propagate fast-math-flags?
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_implicit_buffer_ptr: {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdHsaOS()) {
+ if (!Subtarget->isAmdCodeObjectV2(MF)) {
DiagnosticInfoUnsupported BadIntrin(
*MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -1671,6 +2396,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
= TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
}
+ case Intrinsic::amdgcn_dispatch_id: {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq:
@@ -1682,6 +2411,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
}
+ case Intrinsic::amdgcn_rcp_legacy: {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ return emitRemovedIntrinsicError(DAG, DL, VT);
+ return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
+ }
case Intrinsic::amdgcn_rsq_clamp: {
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
@@ -1750,22 +2484,17 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
- case Intrinsic::amdgcn_read_workdim:
- case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
- // Really only 2 bits.
- return lowerImplicitZextParam(DAG, Op, MVT::i8,
- getImplicitParameterOffset(MFI, GRID_DIM));
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
@@ -1786,9 +2515,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
};
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
- VT.getStoreSize(), 4);
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), 4);
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
@@ -1818,6 +2548,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(0, DL, MVT::i32));
SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
DAG.getConstant(1, DL, MVT::i32));
+ I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
+ J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
SDValue Glue = M0.getValue(1);
SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
@@ -1827,6 +2559,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_interp_mov: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+ SDValue Glue = M0.getValue(1);
+ return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Glue);
+ }
case Intrinsic::amdgcn_interp_p1: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
SDValue Glue = M0.getValue(1);
@@ -1899,6 +2637,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
Denominator, Numerator);
}
+ case Intrinsic::amdgcn_icmp: {
+ const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ int CondCode = CD->getSExtValue();
+
+ if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
+ CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+ return DAG.getUNDEF(VT);
+
+ ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
+ ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
+ return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), DAG.getCondCode(CCOpcode));
+ }
+ case Intrinsic::amdgcn_fcmp: {
+ const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ int CondCode = CD->getSExtValue();
+
+ if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
+ CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+ return DAG.getUNDEF(VT);
+
+ FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
+ ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
+ return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), DAG.getCondCode(CCOpcode));
+ }
+ case Intrinsic::amdgcn_fmul_legacy:
+ return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_sffbh:
+ case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
+ return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
@@ -1907,6 +2677,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ SDLoc DL(Op);
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec: {
@@ -1922,6 +2693,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // offset
+ Op.getOperand(5), // glc
+ Op.getOperand(6) // slc
+ };
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
+ AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+ EVT VT = Op.getValueType();
+ EVT IntVT = VT.changeTypeToInteger();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MFI->getBufferPSV()),
+ MachineMemOperand::MOLoad,
+ VT.getStoreSize(), VT.getStoreSize());
+
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+ }
default:
return SDValue();
}
@@ -1935,12 +2731,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_sendmsg: {
+ case AMDGPUIntrinsic::SI_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsg: {
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SDValue Glue = Chain.getValue(1);
return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+ SDValue Glue = Chain.getValue(1);
+ return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
+ Op.getOperand(2), Glue);
+ }
case AMDGPUIntrinsic::SI_tbuffer_store: {
SDValue Ops[] = {
Chain,
@@ -1969,12 +2772,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, VT, MMO);
}
case AMDGPUIntrinsic::AMDGPU_kill: {
- if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
+ SDValue Src = Op.getOperand(2);
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
if (!K->isNegative())
return Chain;
+
+ SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
+ return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
}
- return Op;
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
+ return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
+ }
+ case AMDGPUIntrinsic::SI_export: {
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
+ const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
+ Op.getOperand(7), // src0
+ Op.getOperand(8), // src1
+ Op.getOperand(9), // src2
+ Op.getOperand(10) // src3
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
}
default:
return SDValue();
@@ -1988,7 +2819,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = Load->getMemoryVT();
if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
- assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
// First, load into 32 bits, then truncate to 1 bit.
@@ -1996,8 +2826,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue BasePtr = Load->getBasePtr();
MachineMemOperand *MMO = Load->getMemOperand();
+ EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
+
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
- BasePtr, MVT::i8, MMO);
+ BasePtr, RealMemVT, MMO);
SDValue Ops[] = {
DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
@@ -2021,17 +2853,34 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, DL);
}
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // If there is a possibility that flat instructions access scratch memory,
+  // then we need to use the same legalization rules we use for private.
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
+ AS = MFI->hasFlatScratchInit() ?
+ AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+
unsigned NumElements = MemVT.getVectorNumElements();
switch (AS) {
case AMDGPUAS::CONSTANT_ADDRESS:
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
+ // have the same legalization requirements as global and private
// loads.
//
- // Fall-through
- case AMDGPUAS::GLOBAL_ADDRESS:
+ LLVM_FALLTHROUGH;
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
+ isMemOpHasNoClobberedMemOperand(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ }
+ LLVM_FALLTHROUGH;
case AMDGPUAS::FLAT_ADDRESS:
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
@@ -2110,22 +2959,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
- if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
- CLHS->isExactlyValue(1.0)) {
- // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
- // the CI documentation has a worst case error of 1 ulp.
- // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
- // use it as long as we aren't trying to use denormals.
-
- // 1.0 / sqrt(x) -> rsq(x)
- //
- // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
- // error seems really high at 2^29 ULP.
- if (RHS.getOpcode() == ISD::FSQRT)
- return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
- // 1.0 / x -> rcp(x)
- return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+ if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+ VT == MVT::f16) {
+ if (CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation they have a worst-case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+ // 1.0 / sqrt(x) -> rsq(x)
+
+ // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+ // error seems really high at 2^29 ULP.
+ if (RHS.getOpcode() == ISD::FSQRT)
+ return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+ // 1.0 / x -> rcp(x)
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+ }
+
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ // -1.0 / x -> rcp (fneg x)
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+ }
}
}
@@ -2143,6 +3003,67 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return SDValue();
}
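+// Helpers for the FDIV32 expansion below: when a glue chain is present (the
+// path that toggles the denormal mode with s_setreg), the plain FMUL/FMA nodes
+// are replaced with their chained variants so the arithmetic cannot be
+// scheduled outside the enable/disable window.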
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMUL:
+ Opcode = AMDGPUISD::FMUL_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+ GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue C,
+ SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B, C);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMA:
+ Opcode = AMDGPUISD::FMA_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+ GlueChain.getValue(2));
+}
+
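+// f16 fdiv is lowered by promoting to f32: the quotient estimate is
+// cvt_f32(x) * rcp(cvt_f32(y)), rounded back to f16, and DIV_FIXUP then
+// produces the final result, handling the special cases.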
+SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+ return FastLowered;
+
+ SDLoc SL(Op);
+ SDValue Src0 = Op.getOperand(0);
+ SDValue Src1 = Op.getOperand(1);
+
+ SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+ SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+
+ SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
+ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+
+ SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+ SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+}
+
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -2189,25 +3110,73 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
- SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ LHS, RHS, LHS);
// Denominator is scaled to not be denormal, so using rcp is ok.
- SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+ DenominatorScaled);
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+ DenominatorScaled);
+
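+  // hwreg(HW_REG_MODE, 4, 2): the two-bit single-precision denormal field of
+  // the MODE register, toggled below when the subtarget does not enable FP32
+  // denormals.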
+ const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+ (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(),
+ EnableDenormValue, BitField);
+ SDValue Ops[3] = {
+ NegDivScale0,
+ EnableDenorm.getValue(0),
+ EnableDenorm.getValue(1)
+ };
+
+ NegDivScale0 = DAG.getMergeValues(Ops, SL);
+ }
- SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+ SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+ ApproxRcp, One, NegDivScale0);
- SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
- SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
+ SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+ ApproxRcp, Fma0);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+ SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+ Fma1, Fma1);
- SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
- SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
- SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+ SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+ NumeratorScaled, Mul);
+
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+
+ SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+ NumeratorScaled, Fma3);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1),
+ DisableDenormValue,
+ BitField,
+ Fma4.getValue(2));
+
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ DisableDenorm, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ }
SDValue Scale = NumeratorScaled.getValue(1);
- SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+ Fma4, Fma1, Fma3, Scale);
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
@@ -2288,6 +3257,9 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f64)
return LowerFDIV64(Op, DAG);
+ if (VT == MVT::f16)
+ return LowerFDIV16(Op, DAG);
+
llvm_unreachable("Unexpected type for fdiv");
}
@@ -2311,6 +3283,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return expandUnalignedStore(Store, DAG);
}
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // If there is a possibility that flat instructions access scratch memory,
+  // then we need to use the same legalization rules we use for private.
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
+ AS = MFI->hasFlatScratchInit() ?
+ AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+
unsigned NumElements = VT.getVectorNumElements();
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS:
@@ -2504,23 +3484,83 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
+SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Ptr = N->getBasePtr();
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ // TODO: We could also do this for multiplies.
+ unsigned AS = N->getAddressSpace();
+ if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+ if (NewPtr) {
+ SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+
+ NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+ }
+
+ return SDValue();
+}
+
+static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
+ return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
+ (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
+ (Opc == ISD::XOR && Val == 0);
+}
+
+// Break up a 64-bit bitwise operation with a constant into two 32-bit
+// and/or/xor operations. This will typically happen anyway for a VALU 64-bit
+// AND. This exposes other 32-bit
+// integer combine opportunities since most 64-bit operations are decomposed
+// this way. TODO: We won't want this for SALU especially if it is an inline
+// immediate.
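+//
+// For example (illustrative):
+//   and i64 %x, 0x00000000ffffffff
+// becomes an AND of the low half with -1 (which folds away) and an AND of the
+// high half with 0 (which folds to zero), leaving cheap 32-bit pieces to
+// recombine.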
+SDValue SITargetLowering::splitBinaryBitConstantOp(
+ DAGCombinerInfo &DCI,
+ const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ const ConstantSDNode *CRHS) const {
+ uint64_t Val = CRHS->getZExtValue();
+ uint32_t ValLo = Lo_32(Val);
+ uint32_t ValHi = Hi_32(Val);
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
+ bitOpWithConstantIsReducible(Opc, ValHi)) ||
+ (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+ // If we need to materialize a 64-bit immediate, it will be split up later
+ // anyway. Avoid creating the harder to understand 64-bit immediate
+ // materialization.
+ return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
+ }
+
+ return SDValue();
+}
+
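The identity this helper exploits, written out for AND as a scalar sketch (names invented here; OR and XOR split the same way):

#include <cstdint>

// A 64-bit bitwise op with a constant is just the two 32-bit ops on the halves.
static uint64_t and64ViaHalves(uint64_t X, uint64_t C) {
  uint32_t Lo = uint32_t(X) & uint32_t(C);              // Lo_32(X) & ValLo
  uint32_t Hi = uint32_t(X >> 32) & uint32_t(C >> 32);  // Hi_32(X) & ValHi
  return (uint64_t(Hi) << 32) | Lo;                     // reassemble the halves
}
// bitOpWithConstantIsReducible recognizes the halves that fold away entirely:
// x & 0, x & 0xffffffff, x | 0, x | 0xffffffff and x ^ 0 all simplify.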
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
return SDValue();
- if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
- return Base;
-
SelectionDAG &DAG = DCI.DAG;
-
- // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
- // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if (LHS.getOpcode() == ISD::SETCC &&
- RHS.getOpcode() == ISD::SETCC) {
+
+ if (VT == MVT::i64) {
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+ }
+
+ // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+ // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
@@ -2568,54 +3608,85 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (VT == MVT::i64) {
- // TODO: This could be a generic combine with a predicate for extracting the
- // high half of an integer being free.
-
- // (or i64:x, (zero_extend i32:y)) ->
- // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
- if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getOpcode() != ISD::ZERO_EXTEND)
- std::swap(LHS, RHS);
-
- if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
- SDValue ExtSrc = RHS.getOperand(0);
- EVT SrcVT = ExtSrc.getValueType();
- if (SrcVT == MVT::i32) {
- SDLoc SL(N);
- SDValue LowLHS, HiBits;
- std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
- SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
-
- DCI.AddToWorklist(LowOr.getNode());
- DCI.AddToWorklist(HiBits.getNode());
-
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- LowOr, HiBits);
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
- }
+ if (VT == MVT::i1) {
+ // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+ if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+ SDValue Src = LHS.getOperand(0);
+ if (Src != RHS.getOperand(0))
+ return SDValue();
+
+ const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!CLHS || !CRHS)
+ return SDValue();
+
+ // Only 10 bits are used.
+ static const uint32_t MaxMask = 0x3ff;
+
+ uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+ Src, DAG.getConstant(NewMask, DL, MVT::i32));
}
+
+ return SDValue();
}
- // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
- if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
- RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
- SDValue Src = LHS.getOperand(0);
- if (Src != RHS.getOperand(0))
- return SDValue();
+ if (VT != MVT::i64)
+ return SDValue();
- const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
- if (!CLHS || !CRHS)
- return SDValue();
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
- // Only 10 bits are used.
- static const uint32_t MaxMask = 0x3ff;
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+ return Split;
+ }
+
+ return SDValue();
+}
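A sketch of why the two class masks can simply be OR'ed together (helper name invented; only the low 10 mask bits are meaningful, hence the MaxMask above):

#include <cstdint>

// FP_CLASS tests whether the class of x, a single bit out of the low 10, is
// contained in the mask, so two tests of the same value combine into one test
// against the union of the masks.
static bool fpClassTest(uint32_t ClassBitOfX, uint32_t Mask) {
  return (ClassBitOfX & Mask) != 0;
}
// fpClassTest(C, M1) || fpClassTest(C, M2) == fpClassTest(C, (M1 | M2) & 0x3ff)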
+
+SDValue SITargetLowering::performXorCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
- uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
- SDLoc DL(N);
- return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
- Src, DAG.getConstant(NewMask, DL, MVT::i32));
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
+ return Split;
}
return SDValue();
@@ -2657,6 +3728,9 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
}
if (C.isNaN()) {
@@ -2716,8 +3790,23 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
- return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+
+ MVT NVT = MVT::i32;
+ unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ SDValue Tmp1, Tmp2, Tmp3;
+ Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+
+ if (VT == MVT::i16) {
+ Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
+ Tmp1, Tmp2, Tmp3);
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
+ } else
+ return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
}
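Reference semantics of the med3 node the combine emits, as a scalar sketch (names invented; the signed case is shown, the unsigned case is analogous). For i16 the operands are widened to i32, the 32-bit med3 is used, and the result is truncated back, which preserves the median:

#include <algorithm>
#include <cstdint>

// med3(a, b, c) is the median of its operands, i.e. clamp(x, lo, hi) when
// lo <= hi.
static int32_t smed3(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}
static int16_t smed3_i16(int16_t X, int16_t K0, int16_t K1) {
  return static_cast<int16_t>(smed3(X, K0, K1));  // widen, med3 in i32, truncate
}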
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -2814,6 +3903,119 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
return SDValue();
}
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+ const SDNode *N0,
+ const SDNode *N1) const {
+ EVT VT = N0->getValueType(0);
+
+ // Only do this if we are not trying to support denormals. v_mad_f32 does not
+ // support denormals ever.
+ if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+ (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+ return ISD::FMAD;
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath ||
+ (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
+ cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
+ isFMAFasterThanFMulAndFAdd(VT)) {
+ return ISD::FMA;
+ }
+
+ return 0;
+}
+
+SDValue SITargetLowering::performFAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ assert(!VT.isVector());
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // These should really be instruction patterns, but writing patterns with
+ // source modifiers is a pain.
+
+ // fadd (fadd (a, a), b) -> mad 2.0, a, b
+ if (LHS.getOpcode() == ISD::FADD) {
+ SDValue A = LHS.getOperand(0);
+ if (A == LHS.getOperand(1)) {
+ unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
+ if (FusedOp != 0) {
+ const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+ return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
+ }
+ }
+ }
+
+ // fadd (b, fadd (a, a)) -> mad 2.0, a, b
+ if (RHS.getOpcode() == ISD::FADD) {
+ SDValue A = RHS.getOperand(0);
+ if (A == RHS.getOperand(1)) {
+ unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
+ if (FusedOp != 0) {
+ const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+ return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performFSubCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+ assert(!VT.isVector());
+
+ // Try to get the fneg to fold into the source modifier. This undoes generic
+ // DAG combines and folds them into the mad.
+ //
+ // Only do this if we are not trying to support denormals. v_mad_f32 does
+ // not support denormals ever.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() == ISD::FADD) {
+ // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+ SDValue A = LHS.getOperand(0);
+ if (A == LHS.getOperand(1)) {
+ unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
+ if (FusedOp != 0) {
+ const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+ SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+ return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
+ }
+ }
+ }
+
+ if (RHS.getOpcode() == ISD::FADD) {
+ // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+
+ SDValue A = RHS.getOperand(0);
+ if (A == RHS.getOperand(1)) {
+ unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
+ if (FusedOp != 0) {
+ const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
+ return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
+ }
+ }
+ }
+
+ return SDValue();
+}
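The algebra behind these two combines, with std::fma standing in for whichever of FMAD/FMA getFusedOpcode selects (sketch only; a + a is exact, so each side performs a single rounding of the same value, ignoring overflow corner cases):

#include <cmath>

static float addTwice(float A, float B)  { return std::fma(A,  2.0f, B);  } // (a+a)+b and b+(a+a)
static float subTwiceL(float A, float C) { return std::fma(A,  2.0f, -C); } // (a+a)-c
static float subTwiceR(float A, float C) { return std::fma(A, -2.0f, C);  } // c-(a+a)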
+
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2823,7 +4025,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
EVT VT = LHS.getValueType();
- if (VT != MVT::f32 && VT != MVT::f64)
+ if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
+ VT != MVT::f16))
return SDValue();
// Match isinf pattern
@@ -2845,14 +4048,59 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
return SDValue();
}
-SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
+ SDLoc SL(N);
+ unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+ SDValue Src = N->getOperand(0);
+ SDValue Srl = N->getOperand(0);
+ if (Srl.getOpcode() == ISD::ZERO_EXTEND)
+ Srl = Srl.getOperand(0);
+
+ // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (Srl.getOpcode() == ISD::SRL) {
+ // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+
+ if (const ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
+ EVT(MVT::i32));
+
+ unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
+ if (SrcOffset < 32 && SrcOffset % 8 == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
+ MVT::f32, Srl);
+ }
+ }
+ }
+
+ APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+ TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ return SDValue();
+}
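What CVT_F32_UBYTEn computes and why a constant srl merely renumbers the byte, as a scalar sketch (helper name invented): byte n of (x >> 8*k) is byte n+k of x, provided n+k still addresses a byte of the 32-bit source, which is the SrcOffset < 32 && SrcOffset % 8 == 0 check above.

#include <cstdint>

static float cvtF32UByte(uint32_t X, unsigned N) {  // N in [0, 3]
  return static_cast<float>((X >> (8 * N)) & 0xff);
}
// cvtF32UByte(X >> 16, 0) == cvtF32UByte(X, 2)
// cvtF32UByte(X >> 16, 1) == cvtF32UByte(X, 3)
// cvtF32UByte(X >> 8, 0)  == cvtF32UByte(X, 1)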
+
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::FADD:
+ return performFAddCombine(N, DCI);
+ case ISD::FSUB:
+ return performFSubCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
@@ -2869,127 +4117,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
-
- case AMDGPUISD::CVT_F32_UBYTE0:
- case AMDGPUISD::CVT_F32_UBYTE1:
- case AMDGPUISD::CVT_F32_UBYTE2:
- case AMDGPUISD::CVT_F32_UBYTE3: {
- unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
- SDValue Src = N->getOperand(0);
-
- // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
- if (Src.getOpcode() == ISD::SRL) {
- // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
- // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
- // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
-
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
- unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
- if (SrcOffset < 32 && SrcOffset % 8 == 0) {
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
- MVT::f32, Src.getOperand(0));
- }
- }
- }
-
- APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
-
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
- TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
- DCI.CommitTargetLoweringOpt(TLO);
- }
-
- break;
- }
-
- case ISD::UINT_TO_FP: {
- return performUCharToFloatCombine(N, DCI);
- }
- case ISD::FADD: {
- if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
- break;
-
- EVT VT = N->getValueType(0);
- if (VT != MVT::f32)
- break;
-
- // Only do this if we are not trying to support denormals. v_mad_f32 does
- // not support denormals ever.
- if (Subtarget->hasFP32Denormals())
- break;
-
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- // These should really be instruction patterns, but writing patterns with
- // source modiifiers is a pain.
-
- // fadd (fadd (a, a), b) -> mad 2.0, a, b
- if (LHS.getOpcode() == ISD::FADD) {
- SDValue A = LHS.getOperand(0);
- if (A == LHS.getOperand(1)) {
- const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
- return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
- }
- }
-
- // fadd (b, fadd (a, a)) -> mad 2.0, a, b
- if (RHS.getOpcode() == ISD::FADD) {
- SDValue A = RHS.getOperand(0);
- if (A == RHS.getOperand(1)) {
- const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
- return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
- }
- }
-
- return SDValue();
- }
- case ISD::FSUB: {
- if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
- break;
-
- EVT VT = N->getValueType(0);
-
- // Try to get the fneg to fold into the source modifier. This undoes generic
- // DAG combines and folds them into the mad.
- //
- // Only do this if we are not trying to support denormals. v_mad_f32 does
- // not support denormals ever.
- if (VT == MVT::f32 &&
- !Subtarget->hasFP32Denormals()) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- if (LHS.getOpcode() == ISD::FADD) {
- // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
-
- SDValue A = LHS.getOperand(0);
- if (A == LHS.getOperand(1)) {
- const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
- SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
-
- return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
- }
- }
-
- if (RHS.getOpcode() == ISD::FADD) {
- // (fsub c, (fadd a, a)) -> mad -2.0, a, c
-
- SDValue A = RHS.getOperand(0);
- if (A == RHS.getOperand(1)) {
- const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
- return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
- }
- }
-
- return SDValue();
- }
-
- break;
- }
case ISD::LOAD:
case ISD::STORE:
case ISD::ATOMIC_LOAD:
@@ -3011,27 +4138,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
-
- MemSDNode *MemNode = cast<MemSDNode>(N);
- SDValue Ptr = MemNode->getBasePtr();
-
- // TODO: We could also do this for multiplies.
- unsigned AS = MemNode->getAddressSpace();
- if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
- SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
- if (NewPtr) {
- SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());
-
- NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
- return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
- }
- }
- break;
+ return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
}
case ISD::AND:
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
+ case ISD::XOR:
+ return performXorCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
@@ -3039,6 +4153,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::FRACT:
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
@@ -3047,38 +4162,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return Src;
break;
}
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return performUCharToFloatCombine(N, DCI);
+ case AMDGPUISD::CVT_F32_UBYTE0:
+ case AMDGPUISD::CVT_F32_UBYTE1:
+ case AMDGPUISD::CVT_F32_UBYTE2:
+ case AMDGPUISD::CVT_F32_UBYTE3:
+ return performCvtF32UByteNCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Analyze the possible immediate value Op
-///
-/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
-/// and the immediate value if it's a literal immediate
-int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
- if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
- if (TII->isInlineConstant(Node->getAPIntValue()))
- return 0;
-
- uint64_t Val = Node->getZExtValue();
- return isUInt<32>(Val) ? Val : -1;
- }
-
- if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
- if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
- return 0;
-
- if (Node->getValueType(0) == MVT::f32)
- return FloatToBits(Node->getValueAPF().convertToFloat());
-
- return -1;
- }
-
- return -1;
-}
-
/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
@@ -3235,13 +4330,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (TII->isMIMG(MI)) {
unsigned VReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+ // TODO: Need mapping tables to handle other cases (register classes).
+ if (RC != &AMDGPU::VReg_128RegClass)
+ return;
+
unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
unsigned BitsSet = 0;
for (unsigned i = 0; i < 4; ++i)
BitsSet += Writemask & (1 << i) ? 1 : 0;
-
- const TargetRegisterClass *RC;
switch (BitsSet) {
default: return;
case 1: RC = &AMDGPU::VGPR_32RegClass; break;
@@ -3379,6 +4477,8 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
+ if (!isTypeLegal(VT))
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -3388,7 +4488,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
default:
return std::make_pair(0U, nullptr);
case 32:
- return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
+ case 16:
+ return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
case 64:
return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
case 128:
@@ -3402,6 +4503,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
default:
return std::make_pair(0U, nullptr);
case 32:
+ case 16:
return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
case 64:
return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1d349fa..6c04e4f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
unsigned Offset) const;
SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Chain, unsigned Offset, bool Signed) const;
+ SDValue Chain, unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -33,11 +34,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
@@ -47,6 +48,16 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ /// \brief Converts \p Op, which must be of floating point type, to the
+ /// floating point type \p VT, by either extending or truncating it.
+ SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL,
+ EVT VT) const;
+
+ /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+ SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
@@ -58,14 +69,27 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
+
+ SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL,
+ unsigned Opc, SDValue LHS,
+ const ConstantSDNode *CRHS) const;
+
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ unsigned getFusedOpcode(const SelectionDAG &DAG,
+ const SDNode *N0, const SDNode *N1) const;
+ SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
@@ -73,6 +97,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isCFIntrinsic(const SDNode *Intr) const;
void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
+
+ /// \returns True if fixup needs to be emitted for given global value \p GV,
+ /// false otherwise.
+ bool shouldEmitFixup(const GlobalValue *GV) const;
+
+ /// \returns True if GOT relocation needs to be emitted for given global value
+ /// \p GV, false otherwise.
+ bool shouldEmitGOTReloc(const GlobalValue *GV) const;
+
+ /// \returns True if PC-relative relocation needs to be emitted for given
+ /// global value \p GV, false otherwise.
+ bool shouldEmitPCReloc(const GlobalValue *GV) const;
+
public:
SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
@@ -98,7 +135,9 @@ public:
MachineFunction &MF) const override;
bool isMemOpUniform(const SDNode *N) const;
+ bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+ bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -141,7 +180,6 @@ public:
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
- int32_t analyzeImmediate(const SDNode *N) const;
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const override;
void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
new file mode 100644
index 0000000..91e4bf7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -0,0 +1,329 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass inserts branches on the 0 exec mask over divergent
+/// branches when it's expected that jumping over the untaken control flow will
+/// be cheaper than having every workitem no-op through it.
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+namespace {
+
+static cl::opt<unsigned> SkipThresholdFlag(
+ "amdgpu-skip-threshold",
+ cl::desc("Number of instructions before jumping over divergent control flow"),
+ cl::init(12), cl::Hidden);
+
+class SIInsertSkips : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+ unsigned SkipThreshold;
+
+ bool shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+ void kill(MachineInstr &MI);
+
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+public:
+ static char ID;
+
+ SIInsertSkips() :
+ MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert s_cbranch_execz instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const {
+ if (From.succ_empty())
+ return false;
+
+ unsigned NumInstr = 0;
+ const MachineFunction *MF = From.getParent();
+
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+ NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(I->getOpcode()))
+ continue;
+
+ // FIXME: Since this is required for correctness, this should be inserted
+ // during SILowerControlFlow.
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+ // when EXEC = 0. We should skip the loop lest it becomes infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (I->isInlineAsm()) {
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ const char *AsmStr = I->getOperand(0).getSymbolName();
+
+ // The inline asm length estimate is the number of bytes assuming the longest
+ // instruction.
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+ NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+ } else {
+ ++NumInstr;
+ }
+
+ if (NumInstr >= SkipThreshold)
+ return true;
+ }
+ }
+
+ return false;
+}
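A standalone restatement of the skip policy above, over a made-up instruction summary (the struct and names are invented for this sketch): zero-size opcodes are free, inline asm is charged at its worst-case byte size divided by the maximum instruction length, a VCC branch forces the skip for correctness, and otherwise the region must reach the amdgpu-skip-threshold (default 12) to be worth jumping over.

#include <vector>

struct InstSummary {
  bool EmitsNoCode;        // opcodeEmitsNoInsts() above
  bool IsVccBranch;        // S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ
  unsigned InlineAsmBytes; // 0 if not inline asm, else worst-case size in bytes
};

static bool worthSkipping(const std::vector<InstSummary> &Region,
                          unsigned Threshold, unsigned MaxInstLength) {
  unsigned NumInstr = 0;
  for (const InstSummary &I : Region) {
    if (I.EmitsNoCode)
      continue;
    if (I.IsVccBranch)  // never taken when EXEC == 0, so the loop must be skipped
      return true;
    NumInstr += I.InlineAsmBytes ? I.InlineAsmBytes / MaxInstLength : 1;
    if (NumInstr >= Threshold)
      return true;
  }
  return false;
}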
+
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction *MF = MBB.getParent();
+
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+ !shouldSkip(MBB, MBB.getParent()->back()))
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // If the exec mask is non-zero, skip the next two instructions
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+ // Exec mask is zero: Export to NULL target...
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
+}
+
+void SIInsertSkips::kill(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+ CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
+ // Kill is only allowed in pixel / geometry shaders.
+ assert(CallConv == CallingConv::AMDGPU_PS ||
+ CallConv == CallingConv::AMDGPU_GS);
+#endif
+ // Clear this thread from the exec mask if the operand is negative.
+ if (Op.isImm()) {
+ // Constant operand: Set exec mask to 0 or do nothing
+ if (Op.getImm() & 0x80000000) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addImm(0);
+ }
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
+ .addImm(0)
+ .addOperand(Op);
+ }
+}
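The per-lane effect of this lowering, sketched in scalar form (function name invented): an immediate source kills the whole wave when its sign bit is set and is a no-op otherwise, while a register source becomes V_CMPX_LE_F32 0, src, so a lane survives only if 0 <= src holds in that lane.

static bool laneSurvivesKill(float SrcForLane) {
  return 0.0f <= SrcForLane;  // false for negative values and for NaN
}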
+
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+ MBB.addSuccessor(SkipBB);
+
+ return SkipBB;
+}
+
+// Returns true if a branch over the block was inserted.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+ MachineBasicBlock &SrcMBB) {
+ MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
+
+ if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+ BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(DestBB);
+
+ return true;
+}
+
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ SkipThreshold = SkipThresholdFlag;
+
+ bool HaveKill = false;
+ bool MadeChange = false;
+
+ // Track depth of exec mask, divergent branches.
+ SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+ MachineFunction::iterator NextBB;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
+ MachineBasicBlock &MBB = *BI;
+
+ if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+ // Reached convergence point for last divergent branch.
+ ExecBranchStack.pop_back();
+ }
+
+ if (HaveKill && ExecBranchStack.empty()) {
+ HaveKill = false;
+
+ // TODO: Insert skip if exec is 0?
+ }
+
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH: {
+ ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+ MadeChange |= skipMaskBranch(MI, MBB);
+ break;
+ }
+ case AMDGPU::S_BRANCH: {
+ // Optimize out branches to the next block.
+ // FIXME: Shouldn't this be handled by BranchFolding?
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_KILL_TERMINATOR: {
+ MadeChange = true;
+ kill(MI);
+
+ if (ExecBranchStack.empty()) {
+ if (skipIfDead(MI, *NextBB)) {
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+ } else {
+ HaveKill = true;
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_RETURN: {
+ // FIXME: Should move somewhere else
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ }
+ default:
+ break;
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index d24588d..fceabd7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -21,6 +21,7 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -29,6 +30,7 @@
#define DEBUG_TYPE "si-insert-waits"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
@@ -59,13 +61,14 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
-
- /// \brief Constant hardware limits
- static const Counters WaitCounts;
+ IsaVersion IV;
/// \brief Constant zero value
static const Counters ZeroCounts;
+ /// \brief Hardware limits
+ Counters HardwareLimits;
+
/// \brief Counter values we have already waited on.
Counters WaitedOn;
@@ -90,6 +93,9 @@ private:
bool LastInstWritesM0;
+ /// Whether or not we have flat operations outstanding.
+ bool IsFlatOutstanding;
+
/// \brief Whether the machine function returns void
bool ReturnsVoid;
@@ -145,7 +151,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "SI insert wait instructions";
}
@@ -170,11 +176,12 @@ FunctionPass *llvm::createSIInsertWaitsPass() {
return new SIInsertWaits();
}
-const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-static bool readsVCCZ(unsigned Opcode) {
- return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+static bool readsVCCZ(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
}
bool SIInsertWaits::hasOutstandingLGKM() const {
@@ -188,8 +195,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
// Only consider stores or EXP for EXP_CNT
- Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
- (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+ Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
// LGKM may use larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
@@ -231,9 +237,10 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
if (Op.isDef())
return true;
- // For exports all registers are relevant
+ // For exports all registers are relevant.
+ // TODO: Skip undef/disabled registers.
MachineInstr &MI = *Op.getParent();
- if (MI.getOpcode() == AMDGPU::EXP)
+ if (TII->isEXP(MI))
return true;
// For stores the stored value is also relevant
@@ -245,12 +252,6 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
// operand comes before the value operand and it may have
// multiple data operands.
- if (TII->isDS(MI) || TII->isFLAT(MI)) {
- MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
- if (Data && Op.isIdenticalTo(*Data))
- return true;
- }
-
if (TII->isDS(MI)) {
MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
if (Data0 && Op.isIdenticalTo(*Data0))
@@ -260,6 +261,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
return Data1 && Op.isIdenticalTo(*Data1);
}
+ if (TII->isFLAT(MI)) {
+ MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (Data && Op.isIdenticalTo(*Data))
+ return true;
+ }
+
// NOTE: This assumes that the value operand is before the
// address operand, and that there is only one value operand.
for (MachineInstr::mop_iterator I = MI.operands_begin(),
@@ -292,6 +299,9 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
Counters Limit = ZeroCounts;
unsigned Sum = 0;
+ if (TII->mayAccessFlatAddressSpace(*I))
+ IsFlatOutstanding = true;
+
for (unsigned i = 0; i < 3; ++i) {
LastIssued.Array[i] += Increment.Array[i];
if (Increment.Array[i])
@@ -330,7 +340,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// Remember which export instructions we have seen
if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
+ ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
}
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
@@ -366,8 +376,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
// Figure out if the async instructions execute in order
bool Ordered[3];
- // VM_CNT is always ordered
- Ordered[0] = true;
+ // VM_CNT is always ordered except when there are flat instructions, which
+ // can return out of order.
+ Ordered[0] = !IsFlatOutstanding;
// EXP_CNT is unordered if we have both EXP & VM-writes
Ordered[1] = ExpInstrTypesSeen == 3;
@@ -376,7 +387,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
Ordered[2] = false;
// The values we are going to put into the S_WAITCNT instruction
- Counters Counts = WaitCounts;
+ Counters Counts = HardwareLimits;
// Do we really need to wait?
bool NeedWait = false;
@@ -392,7 +403,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
unsigned Value = LastIssued.Array[i] - Required.Array[i];
// Adjust the value to the real hardware possibilities.
- Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+ Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
} else
Counts.Array[i] = 0;
@@ -410,12 +421,14 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
// Build the wait instruction
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm((Counts.Named.VM & 0xF) |
- ((Counts.Named.EXP & 0x7) << 4) |
- ((Counts.Named.LGKM & 0xF) << 8));
+ .addImm(encodeWaitcnt(IV,
+ Counts.Named.VM,
+ Counts.Named.EXP,
+ Counts.Named.LGKM));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
+ IsFlatOutstanding = false;
return true;
}
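The hard-coded field layout being removed here, which the encodeWaitcnt helper used above and the decode*cnt helpers used just below (from Utils/AMDGPUBaseInfo) generalize per ISA version, is vmcnt in bits 3:0, expcnt in bits 6:4 and lgkmcnt in bits 11:8. A sketch of that legacy packing (helper names invented):

static unsigned encodeWaitcntLegacy(unsigned VM, unsigned EXP, unsigned LGKM) {
  return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0xF) << 8);
}
static void decodeWaitcntLegacy(unsigned Imm,
                                unsigned &VM, unsigned &EXP, unsigned &LGKM) {
  VM = Imm & 0xF;
  EXP = (Imm >> 4) & 0x7;
  LGKM = (Imm >> 8) & 0xF;
}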
@@ -440,9 +453,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
unsigned Imm = I->getOperand(0).getImm();
Counters Counts, WaitOn;
- Counts.Named.VM = Imm & 0xF;
- Counts.Named.EXP = (Imm >> 4) & 0x7;
- Counts.Named.LGKM = (Imm >> 8) & 0xF;
+ Counts.Named.VM = decodeVmcnt(IV, Imm);
+ Counts.Named.EXP = decodeExpcnt(IV, Imm);
+ Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
for (unsigned i = 0; i < 3; ++i) {
if (Counts.Array[i] <= LastIssued.Array[i])
@@ -491,7 +504,7 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
return;
// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
- if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+ if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
LastInstWritesM0 = false;
return;
@@ -518,26 +531,40 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
+ IV = getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ HardwareLimits.Named.VM = getVmcntBitMask(IV);
+ HardwareLimits.Named.EXP = getExpcntBitMask(IV);
+ HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
WaitedOn = ZeroCounts;
DelayedWaitOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
- ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
+ IsFlatOutstanding = false;
+ ReturnsVoid = MFI->returnsVoid();
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
SmallVector<MachineInstr *, 4> RemoveMI;
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
@@ -557,7 +584,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
}
// Check if we need to apply the bug work-around
- if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+ if (VCCZCorrupt && readsVCCZ(*I)) {
DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
// Wait on everything, not just LGKM. vccz reads usually come from
@@ -572,7 +599,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// vcc and then writing it back to the register.
BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ .addReg(AMDGPU::VCC);
}
}
@@ -590,8 +617,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
// but we also want to wait for any other outstanding transfers before
// signalling other hardware blocks
- if (I->getOpcode() == AMDGPU::S_BARRIER ||
- I->getOpcode() == AMDGPU::S_SENDMSG)
+ if ((I->getOpcode() == AMDGPU::S_BARRIER &&
+ ST->needWaitcntBeforeBarrier()) ||
+ I->getOpcode() == AMDGPU::S_SENDMSG ||
+ I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Required = LastIssued;
else
Required = handleOperands(*I);
@@ -605,12 +634,45 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN)
+ EndPgmBlocks.push_back(&MBB);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ Changes = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
for (MachineInstr *I : RemoveMI)
I->eraseFromParent();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 6163f05..5523ec1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -15,78 +15,111 @@ class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
- field bits<1> VM_CNT = 0;
- field bits<1> EXP_CNT = 0;
- field bits<1> LGKM_CNT = 0;
-
- field bits<1> SALU = 0;
- field bits<1> VALU = 0;
-
- field bits<1> SOP1 = 0;
- field bits<1> SOP2 = 0;
- field bits<1> SOPC = 0;
- field bits<1> SOPK = 0;
- field bits<1> SOPP = 0;
-
- field bits<1> VOP1 = 0;
- field bits<1> VOP2 = 0;
- field bits<1> VOP3 = 0;
- field bits<1> VOPC = 0;
- field bits<1> SDWA = 0;
- field bits<1> DPP = 0;
-
- field bits<1> MUBUF = 0;
- field bits<1> MTBUF = 0;
- field bits<1> SMRD = 0;
- field bits<1> DS = 0;
- field bits<1> MIMG = 0;
- field bits<1> FLAT = 0;
+ // Low bits - basic encoding information.
+ field bit SALU = 0;
+ field bit VALU = 0;
+
+ // SALU instruction formats.
+ field bit SOP1 = 0;
+ field bit SOP2 = 0;
+ field bit SOPC = 0;
+ field bit SOPK = 0;
+ field bit SOPP = 0;
+
+ // VALU instruction formats.
+ field bit VOP1 = 0;
+ field bit VOP2 = 0;
+ field bit VOPC = 0;
+ field bit VOP3 = 0;
+ field bit VINTRP = 0;
+ field bit SDWA = 0;
+ field bit DPP = 0;
+
+ // Memory instruction formats.
+ field bit MUBUF = 0;
+ field bit MTBUF = 0;
+ field bit SMRD = 0;
+ field bit MIMG = 0;
+ field bit EXP = 0;
+ field bit FLAT = 0;
+ field bit DS = 0;
+
+ // Pseudo instruction formats.
+ field bit VGPRSpill = 0;
+ field bit SGPRSpill = 0;
+
+ // High bits - other information.
+ field bit VM_CNT = 0;
+ field bit EXP_CNT = 0;
+ field bit LGKM_CNT = 0;
// Whether WQM _must_ be enabled for this instruction.
- field bits<1> WQM = 0;
- field bits<1> VGPRSpill = 0;
+ field bit WQM = 0;
+
+ // Whether WQM _must_ be disabled for this instruction.
+ field bit DisableWQM = 0;
+
+ field bit Gather4 = 0;
+
+ // Most SOPK instructions treat the immediate as a signed 16-bit value;
+ // however, some use it as unsigned.
+ field bit SOPKZext = 0;
+
+ // This is an s_store_dword* instruction that requires a cache flush
+ // on wave termination. It is necessary to distinguish this from mayStore
+ // SMEM instructions such as the cache-flush instructions themselves.
+ field bit ScalarStore = 0;
+
+ // Whether the operands can be ignored when computing the
+ // instruction size.
+ field bit FixedSize = 0;
// This bit tells the assembler to use the 32-bit encoding in case it
// is unable to infer the encoding from the operands.
- field bits<1> VOPAsmPrefer32Bit = 0;
+ field bit VOPAsmPrefer32Bit = 0;
- field bits<1> Gather4 = 0;
+ // These need to be kept in sync with the enum in SIInstrFlags.
+ let TSFlags{0} = SALU;
+ let TSFlags{1} = VALU;
- // Whether WQM _must_ be disabled for this instruction.
- field bits<1> DisableWQM = 0;
+ let TSFlags{2} = SOP1;
+ let TSFlags{3} = SOP2;
+ let TSFlags{4} = SOPC;
+ let TSFlags{5} = SOPK;
+ let TSFlags{6} = SOPP;
- // These need to be kept in sync with the enum in SIInstrFlags.
- let TSFlags{0} = VM_CNT;
- let TSFlags{1} = EXP_CNT;
- let TSFlags{2} = LGKM_CNT;
-
- let TSFlags{3} = SALU;
- let TSFlags{4} = VALU;
-
- let TSFlags{5} = SOP1;
- let TSFlags{6} = SOP2;
- let TSFlags{7} = SOPC;
- let TSFlags{8} = SOPK;
- let TSFlags{9} = SOPP;
-
- let TSFlags{10} = VOP1;
- let TSFlags{11} = VOP2;
- let TSFlags{12} = VOP3;
- let TSFlags{13} = VOPC;
+ let TSFlags{7} = VOP1;
+ let TSFlags{8} = VOP2;
+ let TSFlags{9} = VOPC;
+ let TSFlags{10} = VOP3;
+
+ let TSFlags{13} = VINTRP;
let TSFlags{14} = SDWA;
let TSFlags{15} = DPP;
let TSFlags{16} = MUBUF;
let TSFlags{17} = MTBUF;
let TSFlags{18} = SMRD;
- let TSFlags{19} = DS;
- let TSFlags{20} = MIMG;
+ let TSFlags{19} = MIMG;
+ let TSFlags{20} = EXP;
let TSFlags{21} = FLAT;
- let TSFlags{22} = WQM;
+ let TSFlags{22} = DS;
+
let TSFlags{23} = VGPRSpill;
- let TSFlags{24} = VOPAsmPrefer32Bit;
- let TSFlags{25} = Gather4;
- let TSFlags{26} = DisableWQM;
+ let TSFlags{24} = SGPRSpill;
+
+ let TSFlags{32} = VM_CNT;
+ let TSFlags{33} = EXP_CNT;
+ let TSFlags{34} = LGKM_CNT;
+
+ let TSFlags{35} = WQM;
+ let TSFlags{36} = DisableWQM;
+ let TSFlags{37} = Gather4;
+
+ let TSFlags{38} = SOPKZext;
+ let TSFlags{39} = ScalarStore;
+ let TSFlags{40} = FixedSize;
+ let TSFlags{41} = VOPAsmPrefer32Bit;
let SchedRW = [Write32Bit];
@@ -95,6 +128,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bits<1> DisableDecoder = 0;
let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
+ let AsmVariantName = AMDGPUAsmVariants.Default;
}
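A sketch of how the reshuffled TSFlags bits are consumed from C++. The real enumerators are the ones in SIInstrFlags that the "kept in sync" comment above refers to; only a few are reproduced here, under invented names:

#include <cstdint>

namespace SketchSIInstrFlags {
enum : uint64_t {
  SALU   = UINT64_C(1) << 0,   // TSFlags{0}
  VALU   = UINT64_C(1) << 1,   // TSFlags{1}
  VM_CNT = UINT64_C(1) << 32,  // now in the "high bits - other information" group
};
}

static bool isVALU(uint64_t TSFlags) {
  return (TSFlags & SketchSIInstrFlags::VALU) != 0;
}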
class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
@@ -103,376 +137,39 @@ class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
let isCodeGenOnly = 1;
}
-class Enc32 {
- field bits<32> Inst;
- int Size = 4;
-}
-
-class Enc64 {
- field bits<64> Inst;
- int Size = 8;
-}
-
-class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
-
-let Uses = [EXEC] in {
-
-class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VALU = 1;
-}
-
-class VOPCCommon <dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <(outs), ins, asm, pattern> {
-
- let VOPC = 1;
- let Size = 4;
- let Defs = [VCC];
-}
-
-class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <outs, ins, asm, pattern> {
-
- let VOP1 = 1;
- let Size = 4;
-}
-
-class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <outs, ins, asm, pattern> {
-
- let VOP2 = 1;
- let Size = 4;
-}
-
-class VOP3Common <dag outs, dag ins, string asm = "",
- list<dag> pattern = [], bit HasMods = 0,
- bit VOP3Only = 0> :
- VOPAnyCommon <outs, ins, asm, pattern> {
-
- // Using complex patterns gives VOP3 patterns a very high complexity rating,
- // but standalone patterns are almost always prefered, so we need to adjust the
- // priority lower. The goal is to use a high number to reduce complexity to
- // zero (or less than zero).
- let AddedComplexity = -1000;
-
- let VOP3 = 1;
- let VALU = 1;
-
- let AsmMatchConverter =
- !if(!eq(VOP3Only,1),
- "cvtVOP3",
- !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
-
- let isCodeGenOnly = 0;
-
- int Size = 8;
-
- // Because SGPRs may be allowed if there are multiple operands, we
- // need a post-isel hook to insert copies in order to avoid
- // violating constant bus requirements.
- let hasPostISelHook = 1;
-}
-
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Scalar operations
-//===----------------------------------------------------------------------===//
-
-class SOP1e <bits<8> op> : Enc32 {
- bits<7> sdst;
- bits<8> src0;
-
- let Inst{7-0} = src0;
- let Inst{15-8} = op;
- let Inst{22-16} = sdst;
- let Inst{31-23} = 0x17d; //encoding;
-}
-
-class SOP2e <bits<7> op> : Enc32 {
- bits<7> sdst;
- bits<8> src0;
- bits<8> src1;
-
- let Inst{7-0} = src0;
- let Inst{15-8} = src1;
- let Inst{22-16} = sdst;
- let Inst{29-23} = op;
- let Inst{31-30} = 0x2; // encoding
-}
-
-class SOPCe <bits<7> op> : Enc32 {
- bits<8> src0;
- bits<8> src1;
-
- let Inst{7-0} = src0;
- let Inst{15-8} = src1;
- let Inst{22-16} = op;
- let Inst{31-23} = 0x17e;
-}
-
-class SOPKe <bits<5> op> : Enc32 {
- bits <7> sdst;
- bits <16> simm16;
-
- let Inst{15-0} = simm16;
- let Inst{22-16} = sdst;
- let Inst{27-23} = op;
- let Inst{31-28} = 0xb; //encoding
-}
-
-class SOPK64e <bits<5> op> : Enc64 {
- bits <7> sdst = 0;
- bits <16> simm16;
- bits <32> imm;
-
- let Inst{15-0} = simm16;
- let Inst{22-16} = sdst;
- let Inst{27-23} = op;
- let Inst{31-28} = 0xb;
-
- let Inst{63-32} = imm;
-}
-
-class SOPPe <bits<7> op> : Enc32 {
- bits <16> simm16;
-
- let Inst{15-0} = simm16;
- let Inst{22-16} = op;
- let Inst{31-23} = 0x17f; // encoding
-}
-
-class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
- bits<7> sdst;
- bits<7> sbase;
-
- let Inst{8} = imm;
- let Inst{14-9} = sbase{6-1};
- let Inst{21-15} = sdst;
- let Inst{26-22} = op;
- let Inst{31-27} = 0x18; //encoding
-}
-
-class SMRD_IMMe <bits<5> op> : SMRDe<op, 1> {
- bits<8> offset;
- let Inst{7-0} = offset;
-}
-
-class SMRD_SOFFe <bits<5> op> : SMRDe<op, 0> {
- bits<8> soff;
- let Inst{7-0} = soff;
-}
-
-
-
-class SMRD_IMMe_ci <bits<5> op> : Enc64 {
- bits<7> sdst;
- bits<7> sbase;
- bits<32> offset;
-
- let Inst{7-0} = 0xff;
- let Inst{8} = 0;
- let Inst{14-9} = sbase{6-1};
- let Inst{21-15} = sdst;
- let Inst{26-22} = op;
- let Inst{31-27} = 0x18; //encoding
- let Inst{63-32} = offset;
-}
-
-let SchedRW = [WriteSALU] in {
-class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let isCodeGenOnly = 0;
- let SALU = 1;
- let SOP1 = 1;
-}
-
-class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let isCodeGenOnly = 0;
- let SALU = 1;
- let SOP2 = 1;
-
- let UseNamedOperandTable = 1;
-}
-
-class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, SOPCe <op> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : PseudoInstSI<outs, ins, pattern> {
let SALU = 1;
- let SOPC = 1;
- let isCodeGenOnly = 0;
- let Defs = [SCC];
-
- let UseNamedOperandTable = 1;
}
-class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins , asm, pattern> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let SALU = 1;
- let SOPK = 1;
-
- let UseNamedOperandTable = 1;
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : PseudoInstSI<outs, ins, pattern> {
+ let VALU = 1;
+ let Uses = [EXEC];
}
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
- InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+class CFPseudoInstSI<dag outs, dag ins, list<dag> pattern = [],
+ bit UseExec = 0, bit DefExec = 0> :
+ SPseudoInstSI<outs, ins, pattern> {
+ let Uses = !if(UseExec, [EXEC], []);
+ let Defs = !if(DefExec, [EXEC, SCC], [SCC]);
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SALU = 1;
- let SOPP = 1;
-
- let UseNamedOperandTable = 1;
-}
-
-} // let SchedRW = [WriteSALU]
-
-class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern> {
-
- let LGKM_CNT = 1;
- let SMRD = 1;
- let mayStore = 0;
- let mayLoad = 1;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let SchedRW = [WriteSMEM];
-}
-
-//===----------------------------------------------------------------------===//
-// Vector ALU operations
-//===----------------------------------------------------------------------===//
-
-class VOP1e <bits<8> op> : Enc32 {
- bits<8> vdst;
- bits<9> src0;
-
- let Inst{8-0} = src0;
- let Inst{16-9} = op;
- let Inst{24-17} = vdst;
- let Inst{31-25} = 0x3f; //encoding
-}
-
-class VOP2e <bits<6> op> : Enc32 {
- bits<8> vdst;
- bits<9> src0;
- bits<8> src1;
-
- let Inst{8-0} = src0;
- let Inst{16-9} = src1;
- let Inst{24-17} = vdst;
- let Inst{30-25} = op;
- let Inst{31} = 0x0; //encoding
-}
-
-class VOP2_MADKe <bits<6> op> : Enc64 {
-
- bits<8> vdst;
- bits<9> src0;
- bits<8> src1;
- bits<32> imm;
-
- let Inst{8-0} = src0;
- let Inst{16-9} = src1;
- let Inst{24-17} = vdst;
- let Inst{30-25} = op;
- let Inst{31} = 0x0; // encoding
- let Inst{63-32} = imm;
-}
-
-class VOP3a <bits<9> op> : Enc64 {
- bits<2> src0_modifiers;
- bits<9> src0;
- bits<2> src1_modifiers;
- bits<9> src1;
- bits<2> src2_modifiers;
- bits<9> src2;
- bits<1> clamp;
- bits<2> omod;
-
- let Inst{8} = src0_modifiers{1};
- let Inst{9} = src1_modifiers{1};
- let Inst{10} = src2_modifiers{1};
- let Inst{11} = clamp;
- let Inst{25-17} = op;
- let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = src0;
- let Inst{49-41} = src1;
- let Inst{58-50} = src2;
- let Inst{60-59} = omod;
- let Inst{61} = src0_modifiers{0};
- let Inst{62} = src1_modifiers{0};
- let Inst{63} = src2_modifiers{0};
-}
-
-class VOP3e <bits<9> op> : VOP3a <op> {
- bits<8> vdst;
-
- let Inst{7-0} = vdst;
}
-// Encoding used for VOPC instructions encoded as VOP3
-// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
-class VOP3ce <bits<9> op> : VOP3a <op> {
- bits<8> sdst;
-
- let Inst{7-0} = sdst;
+class Enc32 {
+ field bits<32> Inst;
+ int Size = 4;
}
-class VOP3be <bits<9> op> : Enc64 {
- bits<8> vdst;
- bits<2> src0_modifiers;
- bits<9> src0;
- bits<2> src1_modifiers;
- bits<9> src1;
- bits<2> src2_modifiers;
- bits<9> src2;
- bits<7> sdst;
- bits<2> omod;
-
- let Inst{7-0} = vdst;
- let Inst{14-8} = sdst;
- let Inst{25-17} = op;
- let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = src0;
- let Inst{49-41} = src1;
- let Inst{58-50} = src2;
- let Inst{60-59} = omod;
- let Inst{61} = src0_modifiers{0};
- let Inst{62} = src1_modifiers{0};
- let Inst{63} = src2_modifiers{0};
+class Enc64 {
+ field bits<64> Inst;
+ int Size = 8;
}
-class VOPCe <bits<8> op> : Enc32 {
- bits<9> src0;
- bits<8> src1;
-
- let Inst{8-0} = src0;
- let Inst{16-9} = src1;
- let Inst{24-17} = op;
- let Inst{31-25} = 0x3e;
-}
+class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
class VINTRPe <bits<2> op> : Enc32 {
bits<8> vdst;
@@ -488,88 +185,6 @@ class VINTRPe <bits<2> op> : Enc32 {
let Inst{31-26} = 0x32; // encoding
}
-class DSe <bits<8> op> : Enc64 {
- bits<8> vdst;
- bits<1> gds;
- bits<8> addr;
- bits<8> data0;
- bits<8> data1;
- bits<8> offset0;
- bits<8> offset1;
-
- let Inst{7-0} = offset0;
- let Inst{15-8} = offset1;
- let Inst{17} = gds;
- let Inst{25-18} = op;
- let Inst{31-26} = 0x36; //encoding
- let Inst{39-32} = addr;
- let Inst{47-40} = data0;
- let Inst{55-48} = data1;
- let Inst{63-56} = vdst;
-}
-
-class MUBUFe <bits<7> op> : Enc64 {
- bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> addr64;
- bits<1> lds;
- bits<8> vaddr;
- bits<8> vdata;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{15} = addr64;
- let Inst{16} = lds;
- let Inst{24-18} = op;
- let Inst{31-26} = 0x38; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
-}
-
-class MTBUFe <bits<3> op> : Enc64 {
- bits<8> vdata;
- bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> addr64;
- bits<4> dfmt;
- bits<3> nfmt;
- bits<8> vaddr;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{15} = addr64;
- let Inst{18-16} = op;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
- let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
-}
-
class MIMGe <bits<7> op> : Enc64 {
bits<8> vdata;
bits<4> dmask;
@@ -600,26 +215,6 @@ class MIMGe <bits<7> op> : Enc64 {
let Inst{57-53} = ssamp{6-2};
}
-class FLATe<bits<7> op> : Enc64 {
- bits<8> addr;
- bits<8> data;
- bits<8> vdst;
- bits<1> slc;
- bits<1> glc;
- bits<1> tfe;
-
- // 15-0 is reserved.
- let Inst{16} = glc;
- let Inst{17} = slc;
- let Inst{24-18} = op;
- let Inst{31-26} = 0x37; // Encoding.
- let Inst{39-32} = addr;
- let Inst{47-40} = data;
- // 54-48 is reserved.
- let Inst{55} = tfe;
- let Inst{63-56} = vdst;
-}
-
class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
@@ -645,92 +240,37 @@ class EXPe : Enc64 {
let Uses = [EXEC] in {
-class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- VOP1Common <outs, ins, asm, pattern>,
- VOP1e<op> {
- let isCodeGenOnly = 0;
-}
-
-class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
- VOP2Common <outs, ins, asm, pattern>, VOP2e<op> {
- let isCodeGenOnly = 0;
-}
-
-class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
- VOPCCommon <ins, asm, pattern>, VOPCe <op>;
-
class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
- let mayLoad = 1;
+ let VINTRP = 1;
+ // VINTRP instructions read parameter values from LDS, but these parameter
+ // values are stored outside of the LDS memory that is allocated to the
+ // shader for general purpose use.
+ //
+ // While it may be possible for ds_read/ds_write instructions to access
+ // the parameter values in LDS, this would essentially be an out-of-bounds
+ // memory access which we consider to be undefined behavior.
+ //
+ // So even though these instructions read memory, this memory is outside the
+ // addressable memory space for the shader, and we consider these instructions
+ // to be readnone.
+ let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
}
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Vector I/O operations
-//===----------------------------------------------------------------------===//
-
-class DS <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
-
- let LGKM_CNT = 1;
- let DS = 1;
- let UseNamedOperandTable = 1;
- let Uses = [M0, EXEC];
-
- // Most instruction load and store data, so set this as the default.
- let mayLoad = 1;
- let mayStore = 1;
-
- let hasSideEffects = 0;
- let AsmMatchConverter = "cvtDS";
- let SchedRW = [WriteLDS];
-}
-
-class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern> {
-
- let VM_CNT = 1;
+class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
+ let EXP = 1;
let EXP_CNT = 1;
- let MUBUF = 1;
- let Uses = [EXEC];
-
- let hasSideEffects = 0;
+ let mayLoad = 0; // Set to 1 if done bit is set.
+ let mayStore = 1;
let UseNamedOperandTable = 1;
- let AsmMatchConverter = "cvtMubuf";
- let SchedRW = [WriteVMEM];
-}
-
-class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern> {
-
- let VM_CNT = 1;
- let EXP_CNT = 1;
- let MTBUF = 1;
let Uses = [EXEC];
-
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let SchedRW = [WriteVMEM];
+ let SchedRW = [WriteExport];
}
-class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, FLATe <op> {
- let FLAT = 1;
- // Internally, FLAT instruction are executed as both an LDS and a
- // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
- // and are not considered done until both have been decremented.
- let VM_CNT = 1;
- let LGKM_CNT = 1;
-
- let Uses = [EXEC, FLAT_SCR]; // M0
-
- let UseNamedOperandTable = 1;
- let hasSideEffects = 0;
- let SchedRW = [WriteVMEM];
-}
+} // End Uses = [EXEC]
class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9190819..26a8d22 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
using namespace llvm;
+// Must be at least 4 to be able to branch over the minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+ cl::desc("Restrict range of branch instructions (DEBUG)"));
+
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
: AMDGPUInstrInfo(ST), RI(), ST(ST) {}
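The amdgpu-s-branch-bits option above exists purely so tests can force the long-branch expansion without multi-kilobyte kernels; a typical invocation, with the input file and bit count as placeholders, would look like:

  llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < branch-relaxation.ll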
@@ -258,7 +265,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
+ const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
+ if (SOffset && SOffset->isReg())
return false;
const MachineOperand *AddrReg =
@@ -270,6 +278,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
getNamedOperand(LdSt, AMDGPU::OpName::offset);
BaseReg = AddrReg->getReg();
Offset = OffsetImm->getImm();
+
+ if (SOffset) // soffset can be an inline immediate.
+ Offset += SOffset->getImm();
+
return true;
}
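With the soffset handling above, an inline-immediate soffset simply folds into the returned offset: for a hypothetical MUBUF load with offset 16 and an inline soffset of 32, the reported base/offset pair is the vaddr register with Offset = 16 + 32 = 48, while a register soffset still makes the address unanalyzable and the function returns false.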
@@ -287,7 +299,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
}
if (isFLAT(LdSt)) {
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
BaseReg = AddrReg->getReg();
Offset = 0;
return true;
@@ -302,20 +314,16 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
const MachineOperand *FirstDst = nullptr;
const MachineOperand *SecondDst = nullptr;
- if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- }
-
- if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
- }
-
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+ } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+ } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
}
if (!FirstDst || !SecondDst)
@@ -342,62 +350,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
+ const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
- // If we are trying to copy to or from SCC, there is a bug somewhere else in
- // the backend. While it may be theoretically possible to do this, it should
- // never be necessary.
- assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
-
- static const int16_t Sub0_15[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- };
-
- static const int16_t Sub0_15_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
- AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
- };
-
- static const int16_t Sub0_7[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- };
-
- static const int16_t Sub0_7_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- };
-
- static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- };
-
- static const int16_t Sub0_3_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- };
-
- static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
- };
-
- static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1,
- };
+ if (RC == &AMDGPU::VGPR_32RegClass) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
- unsigned Opcode;
- ArrayRef<int16_t> SubIndices;
+ if (RC == &AMDGPU::SReg_32_XM0RegClass ||
+ RC == &AMDGPU::SReg_32RegClass) {
+ if (SrcReg == AMDGPU::SCC) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
+ .addImm(-1)
+ .addImm(0);
+ return;
+ }
- if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
+ }
- } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+ if (RC == &AMDGPU::SReg_64RegClass) {
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -405,7 +383,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
@@ -417,62 +395,29 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
+ }
- } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
- assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B64;
- SubIndices = Sub0_3_64;
-
- } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
- assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B64;
- SubIndices = Sub0_7_64;
-
- } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
- assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B64;
- SubIndices = Sub0_15_64;
-
- } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
- assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (DestReg == AMDGPU::SCC) {
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
return;
+ }
- } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
- AMDGPU::SReg_64RegClass.contains(SrcReg));
- Opcode = AMDGPU::V_MOV_B32_e32;
- SubIndices = Sub0_1;
-
- } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
- Opcode = AMDGPU::V_MOV_B32_e32;
- SubIndices = Sub0_2;
-
- } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
- AMDGPU::SReg_128RegClass.contains(SrcReg));
- Opcode = AMDGPU::V_MOV_B32_e32;
- SubIndices = Sub0_3;
-
- } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
- AMDGPU::SReg_256RegClass.contains(SrcReg));
- Opcode = AMDGPU::V_MOV_B32_e32;
- SubIndices = Sub0_7;
-
- } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
- AMDGPU::SReg_512RegClass.contains(SrcReg));
- Opcode = AMDGPU::V_MOV_B32_e32;
- SubIndices = Sub0_15;
-
- } else {
- llvm_unreachable("Can't copy register!");
+ unsigned EltSize = 4;
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RC)) {
+ if (RC->getSize() > 4) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
}
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
@@ -497,9 +442,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
}
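After this rewrite the wide-register cases are no longer enumerated by hand: copyPhysReg splits the class into 64-bit parts for SGPRs and 32-bit parts for VGPRs and emits one move per part. Illustratively, an SReg_128 copy becomes two S_MOV_B64 of sub0_sub1 and sub2_sub3, and a VReg_64 copy becomes two V_MOV_B32_e32 of sub0 and sub1, walking the sub-registers forward or backward depending on the relative hardware register indices so overlapping copies stay correct.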
-int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
- const unsigned Opcode = MI.getOpcode();
-
+int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
int NewOpc;
// Try to map original to commuted opcode
@@ -573,11 +516,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- unsigned Size = FrameInfo->getObjectSize(FrameIndex);
- unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
+ unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+ unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO
@@ -587,20 +530,31 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
+ // We are only allowed to create one new instruction when spilling
+ // registers, so we need to use a pseudo instruction for spilling SGPRs.
+ const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+
+ // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
+ // to make sure we are using the correct register class.
if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
- // m0 may not be allowed for readlane.
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- // We are only allowed to create one new instruction when spilling
- // registers, so we need to use pseudo instruction for spilling
- // SGPRs.
- unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
- BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg, getKillRegState(isKill)) // src
- .addFrameIndex(FrameIndex) // frame_idx
- .addMemOperand(MMO);
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ // Add the scratch resource registers as implicit uses because we may end up
+ // needing them, and need to ensure that the reserved registers are
+ // correctly handled.
+
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
return;
}
@@ -620,11 +574,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg, getKillRegState(isKill)) // src
- .addFrameIndex(FrameIndex) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(0) // offset
+ .addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
@@ -671,10 +625,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
- unsigned Size = FrameInfo->getObjectSize(FrameIndex);
+ unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
+ unsigned Size = FrameInfo.getObjectSize(FrameIndex);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -685,17 +639,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
- unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
-
+ const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
- // m0 may not be allowed for readlane.
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // frame_idx
- .addMemOperand(MMO);
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ .addFrameIndex(FrameIndex) // addr
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
return;
}
@@ -713,7 +672,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // frame_idx
+ .addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
.addImm(0) // offset
@@ -729,7 +688,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
+ unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
unsigned TIDReg = MFI->getTIDReg();
@@ -808,7 +767,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
}
// Add FrameIndex to LDS offset
- unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
+ unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
.addImm(LDSOffset)
.addReg(TIDReg);
@@ -851,7 +810,24 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
-
+ case AMDGPU::S_MOV_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+ case AMDGPU::S_XOR_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_XOR_B64));
+ break;
+ }
+ case AMDGPU::S_ANDN2_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_ANDN2_B64));
+ break;
+ }
case AMDGPU::V_MOV_B64_PSEUDO: {
unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -880,36 +856,37 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_MOVRELD_B32_V1:
+ case AMDGPU::V_MOVRELD_B32_V2:
+ case AMDGPU::V_MOVRELD_B32_V4:
+ case AMDGPU::V_MOVRELD_B32_V8:
+ case AMDGPU::V_MOVRELD_B32_V16: {
+ const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+ unsigned VecReg = MI.getOperand(0).getReg();
+ bool IsUndef = MI.getOperand(1).isUndef();
+ unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+ assert(VecReg == MI.getOperand(1).getReg());
+
+ MachineInstr *MovRel =
+ BuildMI(MBB, MI, DL, MovRelDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .addOperand(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+
+ const int ImpDefIdx =
+ MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+ const int ImpUseIdx = ImpDefIdx + 1;
+ MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
- case AMDGPU::V_CNDMASK_B64_PSEUDO: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
- unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- unsigned Src0 = MI.getOperand(1).getReg();
- unsigned Src1 = MI.getOperand(2).getReg();
- const MachineOperand &SrcCond = MI.getOperand(3);
-
- BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
- .addReg(SrcCond.getReg())
- .addReg(Dst, RegState::Implicit | RegState::Define);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
- .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
- .addReg(Dst, RegState::Implicit | RegState::Define);
MI.eraseFromParent();
break;
}
-
case AMDGPU::SI_PC_ADD_REL_OFFSET: {
- const SIRegisterInfo *TRI
- = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
MachineFunction &MF = *MBB.getParent();
unsigned Reg = MI.getOperand(0).getReg();
- unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
- unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
+ unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+ unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
@@ -921,10 +898,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
.addReg(RegLo)
.addOperand(MI.getOperand(1)));
- Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
- .addReg(RegHi)
- .addImm(0));
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi);
+ if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addOperand(MI.getOperand(2));
+
+ Bundler.append(MIB);
llvm::finalizeBundle(MBB, Bundler.begin());
MI.eraseFromParent();
@@ -934,91 +916,96 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
-/// Commutes the operands in the given instruction.
-/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
-///
-/// Do not call this method for a non-commutable instruction or for
-/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
-/// Even though the instruction is commutable, the method may still
-/// fail to commute the operands, null pointer is returned in such cases.
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
- unsigned OpIdx0,
- unsigned OpIdx1) const {
- int CommutedOpcode = commuteOpcode(MI);
- if (CommutedOpcode == -1)
- return nullptr;
+bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
+ MachineOperand &Src0,
+ unsigned Src0OpName,
+ MachineOperand &Src1,
+ unsigned Src1OpName) const {
+ MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
+ if (!Src0Mods)
+ return false;
- int Src0Idx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
- if (!Src0.isReg())
+ MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
+ assert(Src1Mods &&
+ "All commutable instructions have both src0 and src1 modifiers");
+
+ int Src0ModsVal = Src0Mods->getImm();
+ int Src1ModsVal = Src1Mods->getImm();
+
+ Src1Mods->setImm(Src0ModsVal);
+ Src0Mods->setImm(Src1ModsVal);
+ return true;
+}
+
+static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
+ MachineOperand &RegOp,
+ MachineOperand &NonRegOp) {
+ unsigned Reg = RegOp.getReg();
+ unsigned SubReg = RegOp.getSubReg();
+ bool IsKill = RegOp.isKill();
+ bool IsDead = RegOp.isDead();
+ bool IsUndef = RegOp.isUndef();
+ bool IsDebug = RegOp.isDebug();
+
+ if (NonRegOp.isImm())
+ RegOp.ChangeToImmediate(NonRegOp.getImm());
+ else if (NonRegOp.isFI())
+ RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
+ else
return nullptr;
- int Src1Idx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
+ NonRegOp.setSubReg(SubReg);
- if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
- OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
- (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
- OpIdx1 != static_cast<unsigned>(Src0Idx)))
+ return &MI;
+}
+
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned Src0Idx,
+ unsigned Src1Idx) const {
+ assert(!NewMI && "this should never be used");
+
+ unsigned Opc = MI.getOpcode();
+ int CommutedOpcode = commuteOpcode(Opc);
+ if (CommutedOpcode == -1)
return nullptr;
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
+ static_cast<int>(Src0Idx) &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
+ static_cast<int>(Src1Idx) &&
+ "inconsistency with findCommutedOpIndices");
- if (isVOP2(MI) || isVOPC(MI)) {
- const MCInstrDesc &InstrDesc = MI.getDesc();
- // For VOP2 and VOPC instructions, any operand type is valid to use for
- // src0. Make sure we can use the src0 as src1.
- //
- // We could be stricter here and only allow commuting if there is a reason
- // to do so. i.e. if both operands are VGPRs there is no real benefit,
- // although MachineCSE attempts to find matches by commuting.
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
- return nullptr;
- }
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
- MachineInstr *CommutedMI = &MI;
- if (!Src1.isReg()) {
- // Allow commuting instructions with Imm operands.
- if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) {
- return nullptr;
+ MachineInstr *CommutedMI = nullptr;
+ if (Src0.isReg() && Src1.isReg()) {
+ if (isOperandLegal(MI, Src1Idx, &Src0)) {
+ // Be sure to copy the source modifiers to the right place.
+ CommutedMI
+ = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
}
- // Be sure to copy the source modifiers to the right place.
- if (MachineOperand *Src0Mods =
- getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) {
- MachineOperand *Src1Mods =
- getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
-
- int Src0ModsVal = Src0Mods->getImm();
- if (!Src1Mods && Src0ModsVal != 0)
- return nullptr;
-
- // XXX - This assert might be a lie. It might be useful to have a neg
- // modifier with 0.0.
- int Src1ModsVal = Src1Mods->getImm();
- assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
-
- Src1Mods->setImm(Src0ModsVal);
- Src0Mods->setImm(Src1ModsVal);
- }
-
- unsigned Reg = Src0.getReg();
- unsigned SubReg = Src0.getSubReg();
- if (Src1.isImm())
- Src0.ChangeToImmediate(Src1.getImm());
- else
- llvm_unreachable("Should only have immediates");
- Src1.ChangeToRegister(Reg, false);
- Src1.setSubReg(SubReg);
+ } else if (Src0.isReg() && !Src1.isReg()) {
+ // src0 should always be able to support any operand type, so no need to
+ // check operand legality.
+ CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
+ } else if (!Src0.isReg() && Src1.isReg()) {
+ if (isOperandLegal(MI, Src1Idx, &Src0))
+ CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else {
- CommutedMI =
- TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
+ // FIXME: Found two non-register operands to commute. This does happen.
+ return nullptr;
}
- if (CommutedMI)
+
+ if (CommutedMI) {
+ swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
+ Src1, AMDGPU::OpName::src1_modifiers);
+
CommutedMI->setDesc(get(CommutedOpcode));
+ }
return CommutedMI;
}
@@ -1028,8 +1015,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
- const MCInstrDesc &MCID = MI.getDesc();
- if (!MCID.isCommutable())
+ if (!MI.isCommutable())
return false;
unsigned Opc = MI.getOpcode();
@@ -1037,34 +1023,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
if (Src0Idx == -1)
return false;
- // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
- // immediate. Also, immediate src0 operand is not handled in
- // SIInstrInfo::commuteInstruction();
- if (!MI.getOperand(Src0Idx).isReg())
- return false;
-
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
- if (Src1.isImm()) {
- // SIInstrInfo::commuteInstruction() does support commuting the immediate
- // operand src1 in 2 and 3 operand instructions.
- if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode()))
- return false;
- } else if (Src1.isReg()) {
- // If any source modifiers are set, the generic instruction commuting won't
- // understand how to copy the source modifiers.
- if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
- return false;
- } else
- return false;
-
return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+ int64_t BrOffset) const {
+ // BranchRelaxation should never have to check s_setpc_b64 because its dest
+ // block is unanalyzable.
+ assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+ // Convert to dwords.
+ BrOffset /= 4;
+
+ // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+ // from the next instruction.
+ BrOffset -= 1;
+
+ return isIntN(BranchOffsetBits, BrOffset);
+}
+
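A minimal standalone sketch of the same range check for the default 16-bit simm16 field, with a made-up helper name and byte offsets measured from the branch instruction itself:

  #include <cstdint>

  // PC-relative branches compute PC += signext(simm16 * 4) + 4, so the
  // encodable offset is counted in dwords from the instruction after the branch.
  static bool fitsInDefaultBranchRange(int64_t ByteOffset) {
    int64_t Dwords = ByteOffset / 4 - 1;
    return Dwords >= INT16_MIN && Dwords <= INT16_MAX;
  }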
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+ const MachineInstr &MI) const {
+ if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+ // This would be a difficult analysis to perform, but can always be legal so
+ // there's no need to analyze it.
+ return nullptr;
+ }
+
+ return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS) const {
+ assert(RS && "RegScavenger required for long branching");
+ assert(MBB.empty() &&
+ "new block should be inserted for expanding unconditional branch");
+ assert(MBB.pred_size() == 1);
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // FIXME: Virtual register workaround for RegScavenger not working with empty
+ // blocks.
+ unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ auto I = MBB.end();
+
+ // We need to compute the offset relative to the instruction immediately after
+ // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
+ MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+ // TODO: Handle > 32-bit block address.
+ if (BrOffset >= 0) {
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ } else {
+ // Backwards branch.
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ }
+
+ // Insert the indirect branch after the other terminator.
+ BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+ .addReg(PCReg);
+
+ // FIXME: If spilling is necessary, this will fail because this scavenger has
+ // no emergency stack slots. It is non-trivial to spill in this situation,
+ // because the restore code needs to be specially placed after the
+ // jump. BranchRelaxation then needs to be made aware of the newly inserted
+ // block.
+ //
+ // If a spill is needed for the pc register pair, we need to insert a spill
+ // restore block right before the destination block, and insert a short branch
+ // into the old destination block's fallthrough predecessor.
+ // e.g.:
+ //
+ // s_cbranch_scc0 skip_long_branch:
+ //
+ // long_branch_bb:
+ // spill s[8:9]
+ // s_getpc_b64 s[8:9]
+ // s_add_u32 s8, s8, restore_bb
+ // s_addc_u32 s9, s9, 0
+ // s_setpc_b64 s[8:9]
+ //
+ // skip_long_branch:
+ // foo;
+ //
+ // .....
+ //
+ // dest_bb_fallthrough_predecessor:
+ // bar;
+ // s_branch dest_bb
+ //
+ // restore_bb:
+ // restore s[8:9]
+ // fallthrough dest_bb
+ ///
+ // dest_bb:
+ // buzz;
+
+ RS->enterBasicBlockEnd(MBB);
+ unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+ MachineBasicBlock::iterator(GetPC), 0);
+ MRI.replaceRegWith(PCReg, Scav);
+ MRI.clearVirtRegs();
+ RS->setRegUsed(Scav);
+
+ return 4 + 8 + 4 + 4;
+}
+
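The 4 + 8 + 4 + 4 returned above is the byte size of the expansion, one term per instruction: s_getpc_b64 (4), s_add_u32/s_sub_u32 carrying the 32-bit block-address literal (4 + 4 = 8), s_addc_u32/s_subb_u32 with the inline immediate 0 (4), and s_setpc_b64 (4), for 20 bytes in total.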
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
switch (Cond) {
case SIInstrInfo::SCC_TRUE:
@@ -1103,15 +1190,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
}
}
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
- MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
- if (I == MBB.end())
- return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
if (I->getOpcode() == AMDGPU::S_BRANCH) {
// Unconditional Branch
TBB = I->getOperand(0).getMBB();
@@ -1124,6 +1208,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
++I;
@@ -1142,29 +1227,81 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
return true;
}
-unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+ if (I == MBB.end())
+ return false;
+
+ if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+ return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+ ++I;
+
+ // TODO: Should be able to treat as fallthrough?
+ if (I == MBB.end())
+ return true;
+
+ if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+ return true;
+
+ MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+ // Specifically handle the case where the conditional branch is to the same
+ // destination as the mask branch. e.g.
+ //
+ // si_mask_branch BB8
+ // s_cbranch_execz BB8
+ // s_cbranch BB9
+ //
+ // This is required to understand divergent loops which may need the branches
+ // to be relaxed.
+ if (TBB != MaskBrDest || Cond.empty())
+ return true;
+
+ auto Pred = Cond[0].getImm();
+ return (Pred != EXECZ && Pred != EXECNZ);
+}
+
+unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
MachineBasicBlock::iterator I = MBB.getFirstTerminator();
unsigned Count = 0;
+ unsigned RemovedSize = 0;
while (I != MBB.end()) {
MachineBasicBlock::iterator Next = std::next(I);
+ if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+ I = Next;
+ continue;
+ }
+
+ RemovedSize += getInstSizeInBytes(*I);
I->eraseFromParent();
++Count;
I = Next;
}
+ if (BytesRemoved)
+ *BytesRemoved = RemovedSize;
+
return Count;
}
-unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const {
+ const DebugLoc &DL,
+ int *BytesAdded) const {
if (!FBB && Cond.empty()) {
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded = 4;
return 1;
}
@@ -1174,24 +1311,42 @@ unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
= getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
if (!FBB) {
- BuildMI(&MBB, DL, get(Opcode))
+ MachineInstr *CondBr =
+ BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
+
+ // Copy the flags onto the implicit condition register operand.
+ MachineOperand &CondReg = CondBr->getOperand(1);
+ CondReg.setIsUndef(Cond[1].isUndef());
+ CondReg.setIsKill(Cond[1].isKill());
+
+ if (BytesAdded)
+ *BytesAdded = 4;
return 1;
}
assert(TBB && FBB);
- BuildMI(&MBB, DL, get(Opcode))
+ MachineInstr *CondBr =
+ BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(FBB);
+ MachineOperand &CondReg = CondBr->getOperand(1);
+ CondReg.setIsUndef(Cond[1].isUndef());
+ CondReg.setIsKill(Cond[1].isKill());
+
+ if (BytesAdded)
+ *BytesAdded = 8;
+
return 2;
}
-bool SIInstrInfo::ReverseBranchCondition(
+bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 1);
+ assert(Cond.size() == 2);
Cond[0].setImm(-Cond[0].getImm());
return false;
}
@@ -1210,15 +1365,43 @@ static void removeModOperands(MachineInstr &MI) {
MI.RemoveOperand(Src0ModIdx);
}
-// TODO: Maybe this should be removed this and custom fold everything in
-// SIFoldOperands?
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
unsigned Opc = UseMI.getOpcode();
- if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
+ if (Opc == AMDGPU::COPY) {
+ bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::S_MOV_B64:
+ // TODO: We could fold 64-bit immediates, but this gets complicated
+ // when there are sub-registers.
+ return false;
+
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::S_MOV_B32:
+ break;
+ }
+ unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
+ assert(ImmOp);
+ // FIXME: We could handle FrameIndex values here.
+ if (!ImmOp->isImm()) {
+ return false;
+ }
+ UseMI.setDesc(get(NewOpc));
+ UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+ UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
+ return true;
+ }
+
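A concrete shape of the new COPY fold, with made-up virtual register names: if %a is defined by S_MOV_B32 42 and its only non-debug use is a COPY of %a into a VGPR-class register, the COPY is rewritten in place to V_MOV_B32_e32 42 (or S_MOV_B32 42 for an SGPR destination) and the defining move becomes dead. 64-bit S_MOV_B64 defs are deliberately skipped because sub-register uses would complicate the fold, as the TODO above notes.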
+ if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -1232,14 +1415,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// If this is a free constant, there's no reason to do this.
// TODO: We could fold this here instead of letting SIFoldOperands do it
// later.
- if (isInlineConstant(ImmOp, 4))
+ MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+
+ // Any src operand can be used for the legality check.
+ if (isInlineConstant(UseMI, *Src0, ImmOp))
return false;
- MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
- // Multiplied part is the constant: Use v_madmk_f32
+ // Multiplied part is the constant: Use v_madmk_{f16, f32}.
// We should only expect these to be on src0 due to canonicalizations.
if (Src0->isReg() && Src0->getReg() == Reg) {
if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
@@ -1267,15 +1452,15 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setSubReg(Src1SubReg);
Src0->setIsKill(Src1->isKill());
- if (Opc == AMDGPU::V_MAC_F32_e64) {
+ if (Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
- }
Src1->ChangeToImmediate(Imm);
removeModOperands(UseMI);
- UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
+ UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -1284,7 +1469,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return true;
}
- // Added part is the constant: Use v_madak_f32
+ // Added part is the constant: Use v_madak_{f16, f32}.
if (Src2->isReg() && Src2->getReg() == Reg) {
// Not allowed to use constant bus for another operand.
// We can however allow an inline immediate as src0.
@@ -1306,17 +1491,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
UseMI.RemoveOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
- if (Opc == AMDGPU::V_MAC_F32_e64) {
+ if (Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
- }
// ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
// These come before src2.
removeModOperands(UseMI);
- UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
+ UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -1375,6 +1560,17 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
+ if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
+ const MachineMemOperand *MMOa = *MIa.memoperands_begin();
+ const MachineMemOperand *MMOb = *MIb.memoperands_begin();
+ if (MMOa->getValue() && MMOb->getValue()) {
+ MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
+ MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
+ if (!AA->alias(LocA, LocB))
+ return true;
+ }
+ }
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -1414,15 +1610,22 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
+ bool IsF16 = false;
switch (MI.getOpcode()) {
default:
return nullptr;
+ case AMDGPU::V_MAC_F16_e64:
+ IsF16 = true;
case AMDGPU::V_MAC_F32_e64:
break;
+ case AMDGPU::V_MAC_F16_e32:
+ IsF16 = true;
case AMDGPU::V_MAC_F32_e32: {
- const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::src0);
+ const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
+ if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
return nullptr;
break;
}
@@ -1433,7 +1636,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
- return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ return BuildMI(*MBB, MI, MI.getDebugLoc(),
+ get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
.addOperand(*Dst)
.addImm(0) // Src0 mods
.addOperand(*Src0)
@@ -1445,6 +1649,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
.addImm(0); // omod
}
+// It's not generally safe to move VALU instructions across these since it will
+// start using the register as a base index rather than directly.
+// XXX - Why isn't hasSideEffects sufficient for these?
+static bool changesVGPRIndexingMode(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SET_GPR_IDX_ON:
+ case AMDGPU::S_SET_GPR_IDX_MODE:
+ case AMDGPU::S_SET_GPR_IDX_OFF:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
@@ -1454,67 +1672,78 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
- MI.modifiesRegister(AMDGPU::EXEC, &RI);
+ MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+ changesVGPRIndexingMode(MI);
}
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
- int64_t SVal = Imm.getSExtValue();
- if (SVal >= -16 && SVal <= 64)
- return true;
-
- if (Imm.getBitWidth() == 64) {
- uint64_t Val = Imm.getZExtValue();
- return (DoubleToBits(0.0) == Val) ||
- (DoubleToBits(1.0) == Val) ||
- (DoubleToBits(-1.0) == Val) ||
- (DoubleToBits(0.5) == Val) ||
- (DoubleToBits(-0.5) == Val) ||
- (DoubleToBits(2.0) == Val) ||
- (DoubleToBits(-2.0) == Val) ||
- (DoubleToBits(4.0) == Val) ||
- (DoubleToBits(-4.0) == Val);
- }
-
- // The actual type of the operand does not seem to matter as long
- // as the bits match one of the inline immediate values. For example:
- //
- // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
- // so it is a legal inline immediate.
- //
- // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
- // floating-point, so it is a legal inline immediate.
- uint32_t Val = Imm.getZExtValue();
-
- return (FloatToBits(0.0f) == Val) ||
- (FloatToBits(1.0f) == Val) ||
- (FloatToBits(-1.0f) == Val) ||
- (FloatToBits(0.5f) == Val) ||
- (FloatToBits(-0.5f) == Val) ||
- (FloatToBits(2.0f) == Val) ||
- (FloatToBits(-2.0f) == Val) ||
- (FloatToBits(4.0f) == Val) ||
- (FloatToBits(-4.0f) == Val);
+ switch (Imm.getBitWidth()) {
+ case 32:
+ return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
+ ST.hasInv2PiInlineImm());
+ case 64:
+ return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
+ ST.hasInv2PiInlineImm());
+ case 16:
+ return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+ ST.hasInv2PiInlineImm());
+ default:
+ llvm_unreachable("invalid bitwidth");
+ }
}
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
- unsigned OpSize) const {
- if (MO.isImm()) {
- // MachineOperand provides no way to tell the true operand size, since it
- // only records a 64-bit value. We need to know the size to determine if a
- // 32-bit floating point immediate bit pattern is legal for an integer
- // immediate. It would be for any 32-bit integer operand, but would not be
- // for a 64-bit one.
+ uint8_t OperandType) const {
+ if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+ return false;
- unsigned BitSize = 8 * OpSize;
- return isInlineConstant(APInt(BitSize, MO.getImm(), true));
- }
+ // MachineOperand provides no way to tell the true operand size, since it only
+ // records a 64-bit value. We need to know the size to determine if a 32-bit
+ // floating point immediate bit pattern is legal for an integer immediate. It
+ // would be for any 32-bit integer operand, but would not be for a 64-bit one.
+
+ int64_t Imm = MO.getImm();
+ switch (operandBitWidth(OperandType)) {
+ case 32: {
+ int32_t Trunc = static_cast<int32_t>(Imm);
+ return Trunc == Imm &&
+ AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
+ }
+ case 64: {
+ return AMDGPU::isInlinableLiteral64(MO.getImm(),
+ ST.hasInv2PiInlineImm());
+ }
+ case 16: {
+ if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+ int16_t Trunc = static_cast<int16_t>(Imm);
+ return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ }
- return false;
+ return false;
+ }
+ default:
+ llvm_unreachable("invalid bitwidth");
+ }
}
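As a rough illustration of what the isInlinableLiteral helpers accept (a sketch only; the include paths are approximate and HasInv2Pi stands in for ST.hasInv2PiInlineImm()):

  #include "llvm/Support/MathExtras.h"    // FloatToBits
  #include "Utils/AMDGPUBaseInfo.h"       // AMDGPU::isInlinableLiteral32
  using namespace llvm;

  static void inlineLiteralExamples(bool HasInv2Pi) {
    bool A = AMDGPU::isInlinableLiteral32(63, HasInv2Pi);                // true: integers -16..64
    bool B = AMDGPU::isInlinableLiteral32(FloatToBits(1.0f), HasInv2Pi); // true: 0x3f800000 is 1.0f
    bool C = AMDGPU::isInlinableLiteral32(0x12345678, HasInv2Pi);        // false: needs a 32-bit literal
    (void)A; (void)B; (void)C;
  }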
-bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
- unsigned OpSize) const {
- return MO.isImm() && !isInlineConstant(MO, OpSize);
+bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ return false;
+ case MachineOperand::MO_Immediate:
+ return !isInlineConstant(MO, OpInfo);
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_MCSymbol:
+ return true;
+ default:
+ llvm_unreachable("unexpected operand type");
+ }
}
static bool compareMachineOp(const MachineOperand &Op0,
@@ -1544,11 +1773,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
- if (isLiteralConstant(MO, OpSize))
- return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+ if (MO.isImm() && isInlineConstant(MO, OpInfo))
+ return RI.opCanUseInlineConstant(OpInfo.OperandType);
- return RI.opCanUseInlineConstant(OpInfo.OperandType);
+ return RI.opCanUseLiteralConstant(OpInfo.OperandType);
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
@@ -1575,12 +1803,17 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
- unsigned OpSize) const {
+ const MCOperandInfo &OpInfo) const {
// Literal constants use the constant bus.
- if (isLiteralConstant(MO, OpSize))
- return true;
+ //if (isLiteralConstantLike(MO, OpInfo))
+ // return true;
+ if (MO.isImm())
+ return !isInlineConstant(MO, OpInfo);
+
+ if (!MO.isReg())
+ return true; // Misc other operands like FrameIndex
- if (!MO.isReg() || !MO.isUse())
+ if (!MO.isUse())
return false;
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
@@ -1644,6 +1877,16 @@ static bool shouldReadExec(const MachineInstr &MI) {
return true;
}
+static bool isSubRegOf(const SIRegisterInfo &TRI,
+ const MachineOperand &SuperVec,
+ const MachineOperand &SubReg) {
+ if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
+ return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
+
+ return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
+ SubReg.getReg() == SuperVec.getReg();
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
@@ -1660,6 +1903,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
+ if (MI.isInlineAsm()) {
+ // Verify register classes for inlineasm constraints.
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+ I != E; ++I) {
+ const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+ if (!RC)
+ continue;
+
+ const MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+ ErrInfo = "inlineasm operand has incorrect register class.";
+ return false;
+ }
+ }
+
+ return true;
+ }
+
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI.getOperand(i).isFPImm()) {
@@ -1677,15 +1942,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
break;
- case AMDGPU::OPERAND_REG_IMM32:
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
break;
- case AMDGPU::OPERAND_REG_INLINE_C:
- if (isLiteralConstant(MI.getOperand(i),
- RI.getRegClass(RegClass)->getSize())) {
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
break;
+ }
case MCOI::OPERAND_IMMEDIATE:
case AMDGPU::OPERAND_KIMM32:
// Check if this operand is an immediate.
@@ -1695,7 +1967,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
- // Fall-through
+ LLVM_FALLTHROUGH;
default:
continue;
}
@@ -1737,7 +2009,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (OpIdx == -1)
break;
const MachineOperand &MO = MI.getOperand(OpIdx);
- if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
+ if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
if (MO.getReg() != SGPRUsed)
++ConstantBusCount;
@@ -1768,6 +2040,65 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
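For context, the ConstantBusCount bookkeeping above enforces the SI/VI rule that a VALU instruction may source at most one value over the constant bus, either an SGPR or a non-inline literal. A compact restatement of that budget, with invented names:

// At most one constant-bus read per VALU instruction on SI/VI: one
// distinct SGPR, or one literal that is not an inline constant.
static bool fitsConstantBusBudget(unsigned DistinctSGPRReads,
                                  unsigned LiteralReads) {
  return DistinctSGPRReads + LiteralReads <= 1;
}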
+ if (isSOPK(MI)) {
+ int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+ if (sopkIsZext(MI)) {
+ if (!isUInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ } else {
+ if (!isInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ }
+ }
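The SOPK check above is a plain 16-bit range test. A standalone sketch of the same rule, assuming llvm::isInt/isUInt from MathExtras (the wrapper name is invented):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Zero-extended SOPK opcodes accept 0..65535; the remaining (sign-extended)
// ones accept -32768..32767.
static bool isValidSOPKImm(int64_t Imm, bool IsZext) {
  return IsZext ? llvm::isUInt<16>(Imm) : llvm::isInt<16>(Imm);
}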
+
+ if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
+ const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+ Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
+
+ const unsigned StaticNumOps = Desc.getNumOperands() +
+ Desc.getNumImplicitUses();
+ const unsigned NumImplicitOps = IsDst ? 2 : 1;
+
+ // Allow additional implicit operands. This allows a fixup done by the post
+ // RA scheduler where the main implicit operand is killed and implicit-defs
+ // are added for sub-registers that remain live after this instruction.
+ if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
+ ErrInfo = "missing implicit register operands";
+ return false;
+ }
+
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (IsDst) {
+ if (!Dst->isUse()) {
+ ErrInfo = "v_movreld_b32 vdst should be a use operand";
+ return false;
+ }
+
+ unsigned UseOpIdx;
+ if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
+ UseOpIdx != StaticNumOps + 1) {
+ ErrInfo = "movrel implicit operands should be tied";
+ return false;
+ }
+ }
+
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &ImpUse
+ = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
+ if (!ImpUse.isReg() || !ImpUse.isUse() ||
+ !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
+ ErrInfo = "src0 should be subreg of implicit vector use";
+ return false;
+ }
+ }
+
// Make sure we aren't losing exec uses in the td files. This mostly requires
// being careful when using let Uses to try to add other use registers.
if (shouldReadExec(MI)) {
@@ -1777,6 +2108,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isSMRD(MI)) {
+ if (MI.mayStore()) {
+ // The register offset form of scalar stores may only use m0 as the
+ // soffset register.
+ const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ if (Soff && Soff->getReg() != AMDGPU::M0) {
+ ErrInfo = "scalar stores must use m0 as offset register";
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -1797,13 +2140,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
- case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
- case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
- case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
- case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
- case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
- case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
- case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
+ case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
+ case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
+ case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+ case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
+ case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
+ case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
+ case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
@@ -1830,6 +2173,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
+ case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
+ case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1937,11 +2282,10 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
unsigned SubIdx,
const TargetRegisterClass *SubRC) const {
if (Op.isImm()) {
- // XXX - Is there a better way to do this?
if (SubIdx == AMDGPU::sub0)
- return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
+ return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
if (SubIdx == AMDGPU::sub1)
- return MachineOperand::CreateImm(Op.getImm() >> 32);
+ return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
llvm_unreachable("Unhandled register index for immediate");
}
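The immediate path above splits a 64-bit value into its sub0/sub1 halves with a truncating cast and a shift. The same arithmetic in isolation (helper name invented):

#include <cstdint>
#include <utility>

// sub0 carries the low 32 bits, sub1 the high 32 bits, both as the signed
// 32-bit values a MachineOperand immediate would hold.
static std::pair<int32_t, int32_t> splitImm64(int64_t Imm) {
  return {static_cast<int32_t>(Imm), static_cast<int32_t>(Imm >> 32)};
}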
@@ -1978,8 +2322,8 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:
//
- // v_mov_b32 s0 ; Operand defined as vsrc_32
- // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+ // v_mov_b32 s0 ; Operand defined as vsrc_b32
+ // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
//
// s_sendmsg 0, s0 ; Operand defined as m0reg
// ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
@@ -2008,7 +2352,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
if (!MO)
MO = &MI.getOperand(OpIdx);
- if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
+ if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
RegSubRegPair SGPRUsed;
if (MO->isReg())
@@ -2020,7 +2364,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &Op = MI.getOperand(i);
if (Op.isReg()) {
if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
- usesConstantBus(MRI, Op, getOpSize(MI, i))) {
+ usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
return false;
}
} else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
@@ -2202,6 +2546,39 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
}
}
+void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *DstRC,
+ MachineOperand &Op,
+ MachineRegisterInfo &MRI,
+ const DebugLoc &DL) const {
+
+ unsigned OpReg = Op.getReg();
+ unsigned OpSubReg = Op.getSubReg();
+
+ const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
+ RI.getRegClassForReg(MRI, OpReg), OpSubReg);
+
+ // Check if operand is already the correct register class.
+ if (DstRC == OpRC)
+ return;
+
+ unsigned DstReg = MRI.createVirtualRegister(DstRC);
+ MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
+
+ Op.setReg(DstReg);
+ Op.setSubReg(0);
+
+ MachineInstr *Def = MRI.getVRegDef(OpReg);
+ if (!Def)
+ return;
+
+ // Try to eliminate the copy if it is copying an immediate value.
+ if (Def->isMoveImmediate())
+ FoldImmediate(*Copy, *Def, OpReg, &MRI);
+}
+
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -2260,15 +2637,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
- unsigned DstReg = MRI.createVirtualRegister(RC);
// MI is a PHI instruction.
MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
- BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
- Op.setReg(DstReg);
+ // Avoid creating no-op copies with the same src and dst reg class. These
+ // confuse some of the machine passes.
+ legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
}
}
@@ -2292,12 +2668,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
if (VRC == OpRC)
continue;
- unsigned DstReg = MRI.createVirtualRegister(VRC);
-
- BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
-
- Op.setReg(DstReg);
+ legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
Op.setIsKill();
}
}
@@ -2313,11 +2684,9 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
if (DstRC != Src0RC) {
- MachineBasicBlock &MBB = *MI.getParent();
- unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
- .addReg(Src0);
- MI.getOperand(1).setReg(NewSrc0);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineOperand &Op = MI.getOperand(1);
+ legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
}
return;
}
@@ -2664,6 +3033,22 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
unsigned DstReg = Inst.getOperand(0).getReg();
+ if (Inst.isCopy() &&
+ TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+ NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge, since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+ MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Inst.getOperand(0).setReg(DstReg);
+ continue;
+ }
+
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
}
@@ -2927,10 +3312,16 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
- E = MRI.use_end(); I != E; ++I) {
+ E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
if (!canReadVGPR(UseMI, I.getOperandNo())) {
Worklist.push_back(&UseMI);
+
+ do {
+ ++I;
+ } while (I != E && I->getParent() == &UseMI);
+ } else {
+ ++I;
}
}
}
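The reworked loop above advances the use iterator by hand so that an instruction queued for the worklist is skipped as a whole rather than revisited once per operand. A generic restatement of that traversal pattern, with invented names:

// Walk a range of "uses", each owned by some instruction-like object.
// When an owner is queued, skip every remaining use of the same owner;
// otherwise advance one use at a time.
template <typename Iter, typename OwnerOf, typename ShouldQueue, typename Queue>
static void walkUses(Iter I, Iter E, OwnerOf OwnerOfFn,
                     ShouldQueue ShouldQueueFn, Queue QueueFn) {
  while (I != E) {
    auto *Owner = OwnerOfFn(I);
    if (ShouldQueueFn(Owner)) {
      QueueFn(Owner);
      do {
        ++I;
      } while (I != E && OwnerOfFn(I) == Owner);
    } else {
      ++I;
    }
  }
}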
@@ -3098,6 +3489,56 @@ bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}
+unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
+ int &FrameIndex) const {
+ const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ if (!Addr || !Addr->isFI())
+ return AMDGPU::NoRegister;
+
+ assert(!MI.memoperands_empty() &&
+ (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+
+ FrameIndex = Addr->getIndex();
+ return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+}
+
+unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
+ int &FrameIndex) const {
+ const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
+ assert(Addr && Addr->isFI());
+ FrameIndex = Addr->getIndex();
+ return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
+}
+
+unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+
+ if (!MI.mayLoad())
+ return AMDGPU::NoRegister;
+
+ if (isMUBUF(MI) || isVGPRSpill(MI))
+ return isStackAccess(MI, FrameIndex);
+
+ if (isSGPRSpill(MI))
+ return isSGPRStackAccess(MI, FrameIndex);
+
+ return AMDGPU::NoRegister;
+}
+
+unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (!MI.mayStore())
+ return AMDGPU::NoRegister;
+
+ if (isMUBUF(MI) || isVGPRSpill(MI))
+ return isStackAccess(MI, FrameIndex);
+
+ if (isSGPRSpill(MI))
+ return isSGPRStackAccess(MI, FrameIndex);
+
+ return AMDGPU::NoRegister;
+}
+
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
@@ -3105,32 +3546,45 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// If we have a definitive size, we can use it. Otherwise we need to inspect
// the operands to know the size.
- if (DescSize == 8 || DescSize == 4)
+ //
+ // FIXME: Instructions that have a base 32-bit encoding report their size as
+ // 4, even though they are really 8 bytes if they have a literal operand.
+ if (DescSize != 0 && DescSize != 4)
return DescSize;
- assert(DescSize == 0);
+ if (Opc == AMDGPU::WAVE_BARRIER)
+ return 0;
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
+ if (isFixedSize(MI)) {
+ assert(DescSize == 4);
+ return DescSize;
+ }
+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return 4; // No operands.
- if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
+ if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
return 8;
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return 4;
- if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
+ if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
return 8;
return 4;
}
+ if (DescSize == 4)
+ return 4;
+
switch (Opc) {
+ case AMDGPU::SI_MASK_BRANCH:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
@@ -3147,6 +3601,20 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
}
}
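The sizing logic above amounts to a fixed DescSize when one is known, and otherwise a 4-byte base encoding plus one extra dword when src0 or src1 needs a 32-bit literal. A simplified sketch of that arithmetic (parameter names invented):

// 4-byte base VALU/SALU encoding, plus a trailing 32-bit literal dword
// when either source operand cannot be expressed as an inline constant.
static unsigned valuSaluSizeInBytes(bool Src0NeedsLiteral,
                                    bool Src1NeedsLiteral) {
  return (Src0NeedsLiteral || Src1NeedsLiteral) ? 8u : 4u;
}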
+bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
+ if (!isFLAT(MI))
+ return false;
+
+ if (MI.memoperands_empty())
+ return true;
+
+ for (const MachineMemOperand *MMO : MI.memoperands()) {
+ if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+ return true;
+ }
+ return false;
+}
+
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fef8904..e68f6f9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -86,6 +86,10 @@ private:
unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
+ bool swapSourceModifiers(MachineInstr &MI,
+ MachineOperand &Src0, unsigned Src0OpName,
+ MachineOperand &Src1, unsigned Src1OpName) const;
+
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
@@ -94,7 +98,18 @@ public:
enum TargetOperandFlags {
MO_NONE = 0,
- MO_GOTPCREL = 1
+ // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
+ MO_GOTPCREL = 1,
+ // MO_GOTPCREL32_LO -> symbol@gotpcrel32@lo -> R_AMDGPU_GOTPCREL32_LO.
+ MO_GOTPCREL32 = 2,
+ MO_GOTPCREL32_LO = 2,
+ // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI.
+ MO_GOTPCREL32_HI = 3,
+ // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO.
+ MO_REL32 = 4,
+ MO_REL32_LO = 4,
+ // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
+ MO_REL32_HI = 5
};
explicit SIInstrInfo(const SISubtarget &);
@@ -144,23 +159,48 @@ public:
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
LLVM_READONLY
- int commuteOpcode(const MachineInstr &MI) const;
+ int commuteOpcode(unsigned Opc) const;
+
+ LLVM_READONLY
+ inline int commuteOpcode(const MachineInstr &MI) const {
+ return commuteOpcode(MI.getOpcode());
+ }
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS = nullptr) const override;
+
+ bool analyzeBranchImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
- unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const override;
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
- bool ReverseBranchCondition(
+ bool reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const override;
bool
@@ -332,6 +372,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ static bool isEXP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::EXP;
+ }
+
+ bool isEXP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::EXP;
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -356,6 +404,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
}
+ static bool isSGPRSpill(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SGPRSpill;
+ }
+
+ bool isSGPRSpill(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
+ }
+
static bool isDPP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DPP;
}
@@ -372,6 +428,32 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
}
+ static bool sopkIsZext(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
+ }
+
+ bool sopkIsZext(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
+ }
+
+ /// \returns true if this is an s_store_dword* instruction. This is more
+ /// specific than isSMEM && mayStore.
+ static bool isScalarStore(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
+ }
+
+ bool isScalarStore(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
+ }
+
+ static bool isFixedSize(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
+ }
+
+ bool isFixedSize(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -380,9 +462,96 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
+ static int operandBitWidth(uint8_t OperandType) {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ return 32;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ return 64;
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ return 16;
+ default:
+ llvm_unreachable("unexpected operand type");
+ }
+ }
+
bool isInlineConstant(const APInt &Imm) const;
- bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
- bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
+
+ bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
+
+ bool isInlineConstant(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const {
+ return isInlineConstant(MO, OpInfo.OperandType);
+ }
+
+ /// \returns true if substituting \p DefMO for \p UseMO in \p MI would yield
+ /// an inline immediate.
+ bool isInlineConstant(const MachineInstr &MI,
+ const MachineOperand &UseMO,
+ const MachineOperand &DefMO) const {
+ assert(UseMO.getParent() == &MI);
+ int OpIdx = MI.getOperandNo(&UseMO);
+ if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands) {
+ return false;
+ }
+
+ return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
+ }
+
+ /// \returns true if the operand \p OpIdx in \p MI is a valid inline
+ /// immediate.
+ bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+ }
+
+ bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
+ const MachineOperand &MO) const {
+ if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands)
+ return false;
+
+ if (MI.isCopy()) {
+ unsigned Size = getOpSize(MI, OpIdx);
+ assert(Size == 8 || Size == 4);
+
+ uint8_t OpType = (Size == 8) ?
+ AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32;
+ return isInlineConstant(MO, OpType);
+ }
+
+ return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+ }
+
+ bool isInlineConstant(const MachineOperand &MO) const {
+ const MachineInstr *Parent = MO.getParent();
+ return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
+ }
+
+ bool isLiteralConstant(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const {
+ return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
+ }
+
+ bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ return MO.isImm() && !isInlineConstant(MI, OpIdx);
+ }
+
+ // Returns true if this operand could potentially require a 32-bit literal
+ // operand, but not necessarily. A FrameIndex, for example, could resolve to
+ // an inline immediate value that will not require an additional 4 bytes;
+ // this conservatively assumes that it will.
+ bool isLiteralConstantLike(const MachineOperand &MO,
+ const MCOperandInfo &OpInfo) const;
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
@@ -394,7 +563,7 @@ public:
/// \brief Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
- unsigned OpSize) const;
+ const MCOperandInfo &OpInfo) const;
/// \brief Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
@@ -487,6 +656,12 @@ public:
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+ void legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *DstRC,
+ MachineOperand &Op, MachineRegisterInfo &MRI,
+ const DebugLoc &DL) const;
+
/// \brief Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr &MI) const;
@@ -535,7 +710,17 @@ public:
return get(pseudoToMCOpcode(Opcode));
}
- unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+ unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
@@ -570,10 +755,19 @@ namespace AMDGPU {
LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getSOPKOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
+
+ // For MachineOperands.
+ enum TargetFlags {
+ TF_LONG_BRANCH_FORWARD = 1 << 0,
+ TF_LONG_BRANCH_BACKWARD = 1 << 1
+ };
} // End namespace AMDGPU
namespace SI {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 00f53e8..ebaefae 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -14,75 +14,6 @@ def isCIOnly : Predicate<"Subtarget->getGeneration() =="
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
-class vop {
- field bits<9> SI3;
- field bits<10> VI3;
-}
-
-class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
- field bits<8> SI = si;
- field bits<8> VI = vi;
-
- field bits<9> SI3 = {0, si{7-0}};
- field bits<10> VI3 = {0, 0, vi{7-0}};
-}
-
-class vop1 <bits<8> si, bits<8> vi = si> : vop {
- field bits<8> SI = si;
- field bits<8> VI = vi;
-
- field bits<9> SI3 = {1, 1, si{6-0}};
- field bits<10> VI3 = !add(0x140, vi);
-}
-
-class vop2 <bits<6> si, bits<6> vi = si> : vop {
- field bits<6> SI = si;
- field bits<6> VI = vi;
-
- field bits<9> SI3 = {1, 0, 0, si{5-0}};
- field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
-}
-
-// Specify a VOP2 opcode for SI and VOP3 opcode for VI
-// that doesn't have VOP2 encoding on VI
-class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
- let VI3 = vi;
-}
-
-class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
- let SI3 = si;
- let VI3 = vi;
-}
-
-class sop1 <bits<8> si, bits<8> vi = si> {
- field bits<8> SI = si;
- field bits<8> VI = vi;
-}
-
-class sop2 <bits<7> si, bits<7> vi = si> {
- field bits<7> SI = si;
- field bits<7> VI = vi;
-}
-
-class sopk <bits<5> si, bits<5> vi = si> {
- field bits<5> SI = si;
- field bits<5> VI = vi;
-}
-
-class dsop <bits<8> si, bits<8> vi = si> {
- field bits<8> SI = si;
- field bits<8> VI = vi;
-}
-
-// Specify an SMRD opcode for SI and SMEM opcode for VI
-
-// FIXME: This should really be bits<5> si, Tablegen crashes if
-// parameter default value is other parameter with different bit size
-class smrd<bits<8> si, bits<8> vi = si> {
- field bits<5> SI = si{4-0};
- field bits<8> VI = vi;
-}
-
// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
@@ -127,6 +58,19 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SDTBufferLoad : SDTypeProfile<1, 5,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex
+ SDTCisVT<3, i32>, // offset
+ SDTCisVT<4, i1>, // glc
+ SDTCisVT<5, i1>]>; // slc
+
+def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+
def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
SDTCisVT<3, i32>]>
@@ -143,72 +87,15 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
- SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+ SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
//===----------------------------------------------------------------------===//
-// PatFrags for FLAT instructions
-//===----------------------------------------------------------------------===//
-
-class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
- (ld node:$ptr), [{
- const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-def flat_load : flat_ld <load>;
-def atomic_flat_load : flat_ld<atomic_load>;
-def flat_az_extloadi8 : flat_ld <az_extloadi8>;
-def flat_sextloadi8 : flat_ld <sextloadi8>;
-def flat_az_extloadi16 : flat_ld <az_extloadi16>;
-def flat_sextloadi16 : flat_ld <sextloadi16>;
-
-class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
- const MemSDNode *ST = cast<MemSDNode>(N);
- return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-}]>;
-
-def flat_store: flat_st <store>;
-def atomic_flat_store: flat_st <atomic_store>;
-def flat_truncstorei8 : flat_st <truncstorei8>;
-def flat_truncstorei16 : flat_st <truncstorei16>;
-
-class MubufLoad <SDPatternOperator op> : PatFrag <
- (ops node:$ptr), (op node:$ptr), [{
-
- const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-def mubuf_load : MubufLoad <load>;
-def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
-def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
-def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
-def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
-
-def mubuf_load_atomic : MubufLoad <atomic_load>;
-
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- auto Ld = cast<LoadSDNode>(N);
- return Ld->getAlignment() >= 4 &&
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
-}]>;
-
-//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
-def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
-
-def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>;
-def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>;
+defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
@@ -338,36 +225,6 @@ def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
-// Transformation function, extract the lower 32bit of a 64bit immediate
-def LO32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N),
- MVT::i32);
-}]>;
-
-def LO32f : SDNodeXForm<fpimm, [{
- APInt V = N->getValueAPF().bitcastToAPInt().trunc(32);
- return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
-}]>;
-
-// Transformation function, extract the upper 32bit of a 64bit immediate
-def HI32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32);
-}]>;
-
-def HI32f : SDNodeXForm<fpimm, [{
- APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32);
- return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N),
- MVT::f32);
-}]>;
-
-def IMM8bitDWORD : PatLeaf <(imm),
- [{return (N->getZExtValue() & ~0x3FC) == 0;}]
->;
-
-def as_dword_i32imm : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32);
-}]>;
-
def as_i1imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
}]>;
@@ -394,24 +251,17 @@ return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
}]>;
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+ auto FI = cast<FrameIndexSDNode>(N);
+ return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
// Copied from the AArch64 backend:
def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
-def IMM8bit : PatLeaf <(imm),
- [{return isUInt<8>(N->getZExtValue());}]
->;
-
-def IMM12bit : PatLeaf <(imm),
- [{return isUInt<12>(N->getZExtValue());}]
->;
-
-def IMM16bit : PatLeaf <(imm),
- [{return isUInt<16>(N->getZExtValue());}]
->;
-
def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
@@ -420,15 +270,6 @@ def IMM20bit : PatLeaf <(imm),
[{return isUInt<20>(N->getZExtValue());}]
>;
-def IMM32bit : PatLeaf <(imm),
- [{return isUInt<32>(N->getZExtValue());}]
->;
-
-def mubuf_vaddr_offset : PatFrag<
- (ops node:$ptr, node:$offset, node:$imm_offset),
- (add (add node:$ptr, node:$offset), node:$imm_offset)
->;
-
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
@@ -437,29 +278,31 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
return isInlineImmediate(N);
}]>;
-class SGPRImm <dag frag> : PatLeaf<frag, [{
+class VGPRImm <dag frag> : PatLeaf<frag, [{
if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ unsigned Limit = 0;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- U != E; ++U) {
+ Limit < 10 && U != E; ++U, ++Limit) {
const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
- if (RC && SIRI->isSGPRClass(RC))
- return true;
+
+ // If the register class is unknown, it could be a register class that
+ // needs to be an SGPR, e.g. an inline asm constraint.
+ if (!RC || SIRI->isSGPRClass(RC))
+ return false;
}
- return false;
+
+ return Limit < 10;
}]>;
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
-def FRAMEri32 : Operand<iPTR> {
- let MIOperandInfo = (ops i32:$ptr, i32imm:$index);
-}
-
def SoppBrTarget : AsmOperandClass {
let Name = "SoppBrTarget";
let ParserMethod = "parseSOppBrTarget";
@@ -467,14 +310,51 @@ def SoppBrTarget : AsmOperandClass {
def sopp_brtarget : Operand<OtherVT> {
let EncoderMethod = "getSOPPBrEncoding";
+ let DecoderMethod = "decodeSoppBrTarget";
let OperandType = "OPERAND_PCREL";
let ParserMatchClass = SoppBrTarget;
}
def si_ga : Operand<iPTR>;
+def InterpSlotMatchClass : AsmOperandClass {
+ let Name = "InterpSlot";
+ let PredicateMethod = "isInterpSlot";
+ let ParserMethod = "parseInterpSlot";
+ let RenderMethod = "addImmOperands";
+}
+
def InterpSlot : Operand<i32> {
let PrintMethod = "printInterpSlot";
+ let ParserMatchClass = InterpSlotMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrMatchClass : AsmOperandClass {
+ let Name = "Attr";
+ let PredicateMethod = "isInterpAttr";
+ let ParserMethod = "parseInterpAttr";
+ let RenderMethod = "addImmOperands";
+}
+
+// It appears to be necessary to create a separate operand for this to
+// be able to parse attr<num> with no space.
+def Attr : Operand<i32> {
+ let PrintMethod = "printInterpAttr";
+ let ParserMatchClass = AttrMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrChanMatchClass : AsmOperandClass {
+ let Name = "AttrChan";
+ let PredicateMethod = "isAttrChan";
+ let RenderMethod = "addImmOperands";
+}
+
+def AttrChan : Operand<i32> {
+ let PrintMethod = "printInterpAttrChan";
+ let ParserMatchClass = AttrChanMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def SendMsgMatchClass : AsmOperandClass {
@@ -484,6 +364,13 @@ def SendMsgMatchClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+def ExpTgtMatchClass : AsmOperandClass {
+ let Name = "ExpTgt";
+ let PredicateMethod = "isExpTgt";
+ let ParserMethod = "parseExpTgt";
+ let RenderMethod = "printExpTgt";
+}
+
def SendMsgImm : Operand<i32> {
let PrintMethod = "printSendMsg";
let ParserMatchClass = SendMsgMatchClass;
@@ -495,6 +382,11 @@ def SWaitMatchClass : AsmOperandClass {
let ParserMethod = "parseSWaitCntOps";
}
+def VReg32OrOffClass : AsmOperandClass {
+ let Name = "VReg32OrOff";
+ let ParserMethod = "parseVReg32OrOff";
+}
+
def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
@@ -503,6 +395,31 @@ def WAIT_FLAG : Operand <i32> {
include "SIInstrFormats.td"
include "VIInstrFormats.td"
+// ===----------------------------------------------------------------------===//
+// ExpSrc*: special cases for exp source operands, which are printed as
+// "off" depending on the en operand.
+// ===----------------------------------------------------------------------===//
+
+def ExpSrc0 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc0";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc1 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc1";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc2 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc2";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc3 : RegisterOperand<VGPR_32> {
+ let PrintMethod = "printExpSrc3";
+ let ParserMatchClass = VReg32OrOffClass;
+}
+
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
let Name = "Imm"#CName;
let PredicateMethod = "is"#CName;
@@ -547,16 +464,15 @@ def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
-def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>;
-def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>;
-
-def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
+def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
@@ -572,33 +488,96 @@ def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused
def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+
+}
+
} // End OperandType = "OPERAND_IMMEDIATE"
+class KImmMatchClass<int size> : AsmOperandClass {
+ let Name = "KImmFP"#size;
+ let PredicateMethod = "isKImmFP"#size;
+ let ParserMethod = "parseImm";
+ let RenderMethod = "addKImmFP"#size#"Operands";
+}
+
+class kimmOperand<ValueType vt> : Operand<vt> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_KIMM"#vt.Size;
+ let PrintMethod = "printU"#vt.Size#"ImmOperand";
+ let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
+}
+
+// 32-bit VALU immediate operand that uses the constant bus.
+def KImmFP32MatchClass : KImmMatchClass<32>;
+def f32kimm : kimmOperand<i32>;
+
+// 32-bit VALU immediate operand with a 16-bit value that uses the
+// constant bus.
+def KImmFP16MatchClass : KImmMatchClass<16>;
+def f16kimm : kimmOperand<i16>;
+
def VOPDstS64 : VOPDstOperand <SReg_64>;
-def FPInputModsMatchClass : AsmOperandClass {
- let Name = "RegOrImmWithFPInputMods";
+class FPInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "RegOrImmWithFP"#opSize#"InputMods";
let ParserMethod = "parseRegOrImmWithFPInputMods";
- let PredicateMethod = "isRegOrImmWithInputMods";
+ let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
}
+def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
+def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
+def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
-def FPInputMods : Operand <i32> {
+class InputMods <AsmOperandClass matchClass> : Operand <i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_INPUT_MODS";
+ let ParserMatchClass = matchClass;
+}
+
+class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
let PrintMethod = "printOperandAndFPInputMods";
- let ParserMatchClass = FPInputModsMatchClass;
}
-def IntInputModsMatchClass : AsmOperandClass {
- let Name = "RegOrImmWithIntInputMods";
+def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
+def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
+def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
+
+class IntInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "RegOrImmWithInt"#opSize#"InputMods";
let ParserMethod = "parseRegOrImmWithIntInputMods";
- let PredicateMethod = "isRegOrImmWithInputMods";
+ let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods";
+}
+def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
+def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
+
+class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
+def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+
+def FPVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithFPInputMods";
+ let ParserMethod = "parseRegWithFPInputMods";
+ let PredicateMethod = "isVReg";
}
-def IntInputMods: Operand <i32> {
+def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
+def IntVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithIntInputMods";
+ let ParserMethod = "parseRegWithIntInputMods";
+ let PredicateMethod = "isVReg";
+}
+
+def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndIntInputMods";
- let ParserMatchClass = IntInputModsMatchClass;
}
+
//===----------------------------------------------------------------------===//
// Complex patterns
//===----------------------------------------------------------------------===//
@@ -606,24 +585,6 @@ def IntInputMods: Operand <i32> {
def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
-def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
-def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
-def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
-def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
-def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
-def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
-def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
-
-def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
-def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
-def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
-def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
-def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
-
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
@@ -681,455 +642,44 @@ class SIMCInstr <string pseudo, int subtarget> {
// EXP classes
//===----------------------------------------------------------------------===//
-class EXPCommon : InstSI<
+class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
(outs),
- (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
- VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
- "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
- [] > {
-
- let EXP_CNT = 1;
- let Uses = [EXEC];
- let SchedRW = [WriteExport];
-}
-
-multiclass EXP_m {
-
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ;
- }
-
- def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe {
- let DecoderNamespace="SICI";
- let DisableDecoder = DisableSIDecoder;
- }
-
- def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi {
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Scalar classes
-//===----------------------------------------------------------------------===//
-
-class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- SOP1 <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> :
- SOP1 <outs, ins, asm, []>,
- SOP1e <op.SI>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> :
- SOP1 <outs, ins, asm, []>,
- SOP1e <op.VI>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern> {
-
- def "" : SOP1_Pseudo <opName, outs, ins, pattern>;
-
- def _si : SOP1_Real_si <op, opName, outs, ins, asm>;
-
- def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>;
-
-}
-
-multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0),
- opName#" $sdst, $src0", pattern
->;
-
-multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0),
- opName#" $sdst, $src0", pattern
->;
-
-// no input, 64-bit output.
-multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
- def "" : SOP1_Pseudo <opName, (outs SReg_64:$sdst), (ins), pattern>;
-
- def _si : SOP1_Real_si <op, opName, (outs SReg_64:$sdst), (ins),
- opName#" $sdst"> {
- let src0 = 0;
- }
-
- def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$sdst), (ins),
- opName#" $sdst"> {
- let src0 = 0;
- }
-}
-
-// 64-bit input, no output
-multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> {
- def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>;
-
- def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0),
- opName#" $src0"> {
- let sdst = 0;
- }
-
- def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0),
- opName#" $src0"> {
- let sdst = 0;
- }
-}
-
-// 64-bit input, 32-bit output.
-multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0),
- opName#" $sdst, $src0", pattern
->;
-
-// 32-bit input, 64-bit output.
-multiclass SOP1_64_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0),
- opName#" $sdst, $src0", pattern
->;
-
-class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
- SOP2<outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let Size = 4;
-
- // Pseudo instructions have no encodings, but adding this field here allows
- // us to do:
- // let sdst = xxx in {
- // for multiclasses that include both real and pseudo instructions.
- field bits<7> sdst = 0;
-}
-
-class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> :
- SOP2<outs, ins, asm, []>,
- SOP2e<op.SI>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
- SOP2<outs, ins, asm, []>,
- SOP2e<op.VI>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern> {
-
- def "" : SOP2_Pseudo <opName, outs, ins, pattern>;
-
- def _si : SOP2_Real_si <op, opName, outs, ins, asm>;
-
- def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>;
-
-}
-
-multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $sdst, $src0, $src1", pattern
->;
-
-multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $sdst, $src0, $src1", pattern
->;
-
-multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $sdst, $src0, $src1", pattern
->;
-
-multiclass SOP2_64_32_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $sdst, $src0, $src1", pattern
->;
-
-class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
- string opName, list<dag> pattern = []> : SOPC <
- op, (outs), (ins rc0:$src0, rc1:$src1),
- opName#" $src0, $src1", pattern > {
- let Defs = [SCC];
-}
-class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
- string opName, PatLeaf cond> : SOPC_Base <
- op, rc, rc, opName,
- [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
-}
-
-class SOPC_CMP_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
- : SOPC_Helper<op, SSrc_32, i32, opName, cond>;
-
-class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
- : SOPC_Base<op, SSrc_32, SSrc_32, opName, pattern>;
-
-class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
- : SOPC_Base<op, SSrc_64, SSrc_32, opName, pattern>;
-
-class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- SOPK <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> :
- SOPK <outs, ins, asm, []>,
- SOPKe <op.SI>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
- let isCodeGenOnly = 0;
-}
-
-class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> :
- SOPK <outs, ins, asm, []>,
- SOPKe <op.VI>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
- let isCodeGenOnly = 0;
-}
-
-multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm,
- string asm = opName#opAsm> {
- def "" : SOPK_Pseudo <opName, outs, ins, []>;
-
- def _si : SOPK_Real_si <op, opName, outs, ins, asm>;
-
- def _vi : SOPK_Real_vi <op, opName, outs, ins, asm>;
-
-}
-
-multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
- pattern>;
-
- def _si : SOPK_Real_si <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
- opName#" $sdst, $simm16">;
-
- def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
- opName#" $sdst, $simm16">;
-}
-
-multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs),
- (ins SReg_32:$src0, u16imm:$src1), pattern> {
- let Defs = [SCC];
- }
-
-
- def _si : SOPK_Real_si <op, opName, (outs),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
- let Defs = [SCC];
- }
-
- def _vi : SOPK_Real_vi <op, opName, (outs),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
- let Defs = [SCC];
- }
-}
-
-multiclass SOPK_32TIE <sopk op, string opName, list<dag> pattern> : SOPK_m <
- op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16),
- " $sdst, $simm16"
->;
-
-multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins,
- string argAsm, string asm = opName#argAsm> {
-
- def "" : SOPK_Pseudo <opName, outs, ins, []>;
-
- def _si : SOPK <outs, ins, asm, []>,
- SOPK64e <op.SI>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
- let isCodeGenOnly = 0;
- }
-
- def _vi : SOPK <outs, ins, asm, []>,
- SOPK64e <op.VI>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
- let isCodeGenOnly = 0;
- }
-}
-//===----------------------------------------------------------------------===//
-// SMRD classes
-//===----------------------------------------------------------------------===//
-
-class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- SMRD <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class SMRD_IMM_Real_si <bits<5> op, string opName, dag outs, dag ins,
- string asm> :
- SMRD <outs, ins, asm, []>,
- SMRD_IMMe <op>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class SMRD_SOFF_Real_si <bits<5> op, string opName, dag outs, dag ins,
- string asm> :
- SMRD <outs, ins, asm, []>,
- SMRD_SOFFe <op>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-
-class SMRD_IMM_Real_vi <bits<8> op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern = []> :
- SMRD <outs, ins, asm, pattern>,
- SMEM_IMMe_vi <op>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class SMRD_SOFF_Real_vi <bits<8> op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern = []> :
- SMRD <outs, ins, asm, pattern>,
- SMEM_SOFFe_vi <op>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-
-multiclass SMRD_IMM_m <smrd op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern> {
-
- def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
-
- def _si : SMRD_IMM_Real_si <op.SI, opName, outs, ins, asm>;
-
- // glc is only applicable to scalar stores, which are not yet
- // implemented.
- let glc = 0 in {
- def _vi : SMRD_IMM_Real_vi <op.VI, opName, outs, ins, asm>;
- }
-}
-
-multiclass SMRD_SOFF_m <smrd op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern> {
-
- def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
-
- def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, ins, asm>;
-
- // glc is only applicable to scalar stores, which are not yet
- // implemented.
- let glc = 0 in {
- def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, ins, asm>;
- }
-}
-
-multiclass SMRD_Special <smrd op, string opName, dag outs,
- int sdst_ = ?,
- string opStr = "",
- list<dag> pattern = []> {
- let hasSideEffects = 1 in {
- def "" : SMRD_Pseudo <opName, outs, (ins), pattern>;
+ (ins exp_tgt:$tgt,
+ ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
+ exp_vm:$vm, exp_compr:$compr, i8imm:$en),
+ "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
+ [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+ let AsmMatchConverter = "cvtExp";
+}
+
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+multiclass EXP_m<bit done, SDPatternOperator node> {
+ let mayLoad = done in {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : EXP_Helper<done, node>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
+ }
- let sbase = 0, soff = 0, sdst = sdst_ in {
- def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, (ins), opName#opStr>;
+ let done = done in {
+ def _si : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
+ EXPe {
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+ }
- let glc = 0 in {
- def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, (ins), opName#opStr>;
+ def _vi : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
+ EXPe_vi {
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
}
}
}
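// For reference, EXP_m is intended to be instantiated once per "done" flavor,
// e.g. roughly:
//   defm EXP      : EXP_m<0, AMDGPUexport>;
//   defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// The selector node names above are assumptions; only the <done, node>
// parameter shape follows from the definition here.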
-multiclass SMRD_Inval <smrd op, string opName,
- SDPatternOperator node> {
- let mayStore = 1 in {
- defm : SMRD_Special<op, opName, (outs), 0, "", [(node)]>;
- }
-}
-
-class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> :
- SMRD_SOFF_Real_vi<op, opName, (outs), (ins), opName, [(node)]> {
- let hasSideEffects = 1;
- let mayStore = 1;
- let sbase = 0;
- let sdst = 0;
- let glc = 0;
- let soff = 0;
-}
-
-class SMEM_Ret <bits<8> op, string opName, SDPatternOperator node> :
- SMRD_SOFF_Real_vi<op, opName, (outs SReg_64:$sdst), (ins),
- opName#" $sdst", [(set i64:$sdst, (node))]> {
- let hasSideEffects = 1;
- let mayStore = ?;
- let mayLoad = ?;
- let sbase = 0;
- let glc = 0;
- let soff = 0;
-}
-
-multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
- RegisterClass dstClass> {
- defm _IMM : SMRD_IMM_m <
- op, opName#"_IMM", (outs dstClass:$sdst),
- (ins baseClass:$sbase, smrd_offset:$offset),
- opName#" $sdst, $sbase, $offset", []
- >;
-
- def _IMM_ci : SMRD <
- (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
- opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
- }
-
- defm _SGPR : SMRD_SOFF_m <
- op, opName#"_SGPR", (outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff),
- opName#" $sdst, $sbase, $soff", []
- >;
-}
-
//===----------------------------------------------------------------------===//
// Vector ALU classes
//===----------------------------------------------------------------------===//
@@ -1146,43 +696,99 @@ class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
// instructions for the given VT.
class getVALUDstForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
- !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
- !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
- VOPDstOperand<SReg_64>))); // else VT == i1
+ !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
+ !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
+ !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+ VOPDstOperand<SReg_64>)))); // else VT == i1
}
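// Illustration of the selection above: 32- and 16-bit results use a VGPR_32
// destination, 64-bit results use VReg_64, 128-bit results use VReg_128, and
// any remaining size (i.e. i1) falls through to an SReg_64 condition output.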
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32);
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ RegisterOperand ret = !if(isFP,
+ !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
+ !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
}
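// Illustration: f16 selects VSrc_f16 and i16 selects VSrc_b16, 64-bit types
// select VSrc_f64/VSrc_b64, and everything else falls through to VSrc_f32 or
// VSrc_b32 depending on whether the value type is floating point.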
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32);
+ RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
+ !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
RegisterOperand ret =
- !if(!eq(VT.Size, 64),
- VCSrc_64,
- !if(!eq(VT.Value, i1.Value),
- SCSrc_64,
- VCSrc_32
- )
- );
+ !if(!eq(VT.Size, 128),
+ VSrc_128,
+ !if(!eq(VT.Size, 64),
+ !if(isFP,
+ VCSrc_f64,
+ VCSrc_b64),
+ !if(!eq(VT.Value, i1.Value),
+ SCSrc_b64,
+ !if(isFP,
+ !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
+ !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
+ )
+ )
+ )
+ );
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
// XXX - do f16 instructions?
-class hasModifiers<ValueType SrcVT> {
+class isFloatType<ValueType SrcVT> {
bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
!if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1,
- 0));
+ 0)));
+}
+
+class isIntType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, i16.Value), 1,
+ !if(!eq(SrcVT.Value, i32.Value), 1,
+ !if(!eq(SrcVT.Value, i64.Value), 1,
+ 0)));
+}
+
+
+// Return type of input modifiers operand for specified input operand
+class getSrcMod <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(!eq(VT.Size, 64),
+ !if(isFP, FP64InputMods, Int64InputMods),
+ !if(isFP,
+ !if(!eq(VT.Value, f16.Value),
+ FP16InputMods,
+ FP32InputMods
+ ),
+ Int32InputMods)
+ );
+}
+
+// Return type of input modifiers operand for the specified input operand, for SDWA/DPP
+class getSrcModExt <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
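// Illustration: getSrcMod<f64>.ret is FP64InputMods, getSrcMod<f16>.ret is
// FP16InputMods, and integer sources of 32 bits or narrower get
// Int32InputMods; the DPP/SDWA variant only distinguishes FP from integer
// VGPR modifiers (FPVRegInputMods vs. IntVRegInputMods).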
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1195,7 +801,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasModifiers> {
+ bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
+ Operand Src2Mod> {
dag ret =
!if (!eq(NumSrcArgs, 0),
@@ -1205,7 +812,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 1),
!if (!eq(HasModifiers, 1),
// VOP1 with modifiers
- (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp, omod:$omod)
/* else */,
// VOP1 without modifiers
@@ -1214,8 +821,8 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
// VOP 2 with modifiers
- (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
- FPInputMods:$src1_modifiers, Src1RC:$src1,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, omod:$omod)
/* else */,
// VOP2 without modifiers
@@ -1224,9 +831,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
// VOP3 with modifiers
- (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
- FPInputMods:$src1_modifiers, Src1RC:$src1,
- FPInputMods:$src2_modifiers, Src2RC:$src2,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
clampmod:$clamp, omod:$omod)
/* else */,
// VOP3 without modifiers
@@ -1235,7 +842,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
}
class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasModifiers> {
+ bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
dag ret = !if (!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
@@ -1244,7 +851,7 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
!if (!eq(NumSrcArgs, 1),
!if (!eq(HasModifiers, 1),
// VOP1_DPP with modifiers
- (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
/* else */,
@@ -1255,8 +862,8 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
/* NumSrcArgs == 2 */,
!if (!eq(HasModifiers, 1),
// VOP2_DPP with modifiers
- (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
- FPInputMods:$src1_modifiers, Src1RC:$src1,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
/* else */,
@@ -1268,49 +875,28 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
}
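// Illustration: for a two-source op with modifiers, getInsDPP expands to
//   (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
//        Src1Mod:$src1_modifiers, Src1RC:$src1,
//        dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
//        bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
// i.e. the plain VOP2 operand list followed by the DPP control operands.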
class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasFloatModifiers, ValueType DstVT> {
+ bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod,
+ ValueType DstVT> {
dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
(ins),
!if(!eq(NumSrcArgs, 1),
- !if(HasFloatModifiers,
- // VOP1_SDWA with float modifiers
- (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel)
- /* else */,
- // VOP1_SDWA with sext modifier
- (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel)
- /* endif */)
- /* NumSrcArgs == 2 */,
- !if(HasFloatModifiers,
- !if(!eq(DstVT.Size, 1),
- // VOPC_SDWA with float modifiers
- (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
- FPInputMods:$src1_fmodifiers, Src1RC:$src1,
- clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA or VOPC_SDWA with float modifiers
- (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
- FPInputMods:$src1_fmodifiers, Src1RC:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel)
- ),
- /* else */
- !if(!eq(DstVT.Size, 1),
- // VOPC_SDWA with sext modifiers
- (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
- IntInputMods:$src1_imodifiers, Src1RC:$src1,
- clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA or VOPC_SDWA with sext modifier
- (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
- IntInputMods:$src1_imodifiers, Src1RC:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel)
- )
- /* endif */)));
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel),
+ !if(!eq(NumSrcArgs, 2),
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA or VOPC_SDWA with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)),
+ (ins)/* endif */)));
}
// Outs for DPP and SDWA
@@ -1374,8 +960,8 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
" vcc", // use vcc token as dst for VOPC instructioins
"$vdst"),
"");
- string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers");
- string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers");
+ string src0 = "$src0_modifiers";
+ string src1 = "$src1_modifiers";
string args = !if(!eq(NumSrcArgs, 0),
"",
!if(!eq(NumSrcArgs, 1),
@@ -1414,6 +1000,14 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
);
}
+class BitOr<bit a, bit b> {
+ bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> {
+ bit ret = !if(a, !if(b, 1, 0), 0);
+}
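// Illustration: BitOr<a, b>.ret is 1 when either bit is set and BitAnd<a, b>.ret
// only when both are; BitOr is used below to fold the per-source float/int
// modifier flags into HasSrc1Mods and HasSrc2Mods.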
+
class VOPProfile <list<ValueType> _ArgVT> {
field list<ValueType> ArgVT = _ArgVT;
@@ -1434,11 +1028,41 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
+ field Operand Src0Mod = getSrcMod<Src0VT>.ret;
+ field Operand Src1Mod = getSrcMod<Src1VT>.ret;
+ field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+ field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
+ field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
+
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
+ field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
- field bit HasModifiers = hasModifiers<Src0VT>.ret;
+ field bit HasSrc0 = !if(!eq(Src0VT.Value, untyped.Value), 0, 1);
+ field bit HasSrc1 = !if(!eq(Src1VT.Value, untyped.Value), 0, 1);
+ field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
+
+ // TODO: Modifiers logic is somewhat ad hoc here, to be refined later
+ field bit HasModifiers = isFloatType<Src0VT>.ret;
+
+ field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
+ field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
+ field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+
+ field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
+ field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
+ field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
+
+ field bit HasSrc0Mods = HasModifiers;
+ field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
+ field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
+
+ field bit HasOMod = HasModifiers;
+ field bit HasClamp = HasModifiers;
+ field bit HasSDWAClamp = HasSrc0;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
@@ -1449,13 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasModifiers>.ret;
- field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers>.ret;
- field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, DstVT>.ret;
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+ field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
+ HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ DstVT>.ret;
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
@@ -1467,14 +1094,13 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
}
-// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
-// for the instruction patterns to work.
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
-def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>;
-def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
+def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
-def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
+def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
@@ -1492,6 +1118,7 @@ def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
@@ -1500,181 +1127,21 @@ def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
-// Write out to vcc or arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
- let Asm32 = "$vdst, vcc, $src0, $src1";
- let Asm64 = "$vdst, $sdst, $src0, $src1";
- let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
-}
-
-// Write out to vcc or arbitrary SGPR and read in from vcc or
-// arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- // We use VCSrc_32 to exclude literal constants, even though the
- // encoding normally allows them since the implicit VCC use means
- // using one would always violate the constant bus
- // restriction. SGPRs are still allowed because it should
- // technically be possible to use VCC again as src0.
- let Src0RC32 = VCSrc_32;
- let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
- let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
- let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
-
- // Suppress src2 implied by type since the 32-bit encoding uses an
- // implicit VCC use.
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
-}
-
-// Read in from vcc or arbitrary SGPR
-def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
- let Asm32 = "$vdst, $src0, $src1, vcc";
- let Asm64 = "$vdst, $src0, $src1, $src2";
- let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst);
-
- // Suppress src2 implied by type since the 32-bit encoding uses an
- // implicit VCC use.
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
-}
-
-class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
- let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod";
-}
-
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
-}
-
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VReg_64>;
-}
-
-// VOPC instructions are a special case because for the 32-bit
-// encoding, we want to display the implicit vcc write as if it were
-// an explicit $dst.
-class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> {
- let Asm32 = "vcc, $src0, $src1";
- // The destination for 32-bit encoding is implicit.
- let HasDst32 = 0;
- let Outs64 = (outs DstRC:$sdst);
-}
-
-class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> {
- let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- let Asm64 = "$sdst, $src0_modifiers, $src1";
- let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0,
- IntInputMods:$src1_imodifiers, Src1RC64:$src1,
- clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
- let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel";
-
-}
-
-def VOPC_I1_F32_F32 : VOPC_Profile<f32>;
-def VOPC_I1_F64_F64 : VOPC_Profile<f64>;
-def VOPC_I1_I32_I32 : VOPC_Profile<i32>;
-def VOPC_I1_I64_I64 : VOPC_Profile<i64>;
-
-def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>;
-def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>;
-
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
+def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>;
+def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
-def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> {
- field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm);
- field string Asm32 = "$vdst, $src0, $src1, $imm";
- field bit HasExt = 0;
-}
-def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> {
- field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1);
- field string Asm32 = "$vdst, $src0, $imm, $src1";
- field bit HasExt = 0;
-}
-def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
- let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- HasModifiers>.ret;
- let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0,
- FPInputMods:$src1_modifiers, Src1RC32:$src1,
- VGPR_32:$src2, // stub argument
- dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0,
- FPInputMods:$src1_fmodifiers, Src1RC32:$src1,
- VGPR_32:$src2, // stub argument
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
- let Asm32 = getAsm32<1, 2, f32>.ret;
- let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
- let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret;
- let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret;
-}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
+def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
+def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
-// This class is used only with VOPC instructions. Use $sdst for out operand
-class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
- InstAlias <asm, (inst)>, PredicateControl {
-
- field bit isCompare;
- field bit isCommutable;
-
- let ResultInst =
- !if (p.HasDst32,
- !if (!eq(p.NumSrcArgs, 0),
- // 1 dst, 0 src
- (inst p.DstRC:$sdst),
- !if (!eq(p.NumSrcArgs, 1),
- // 1 dst, 1 src
- (inst p.DstRC:$sdst, p.Src0RC32:$src0),
- !if (!eq(p.NumSrcArgs, 2),
- // 1 dst, 2 src
- (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
- // else - unreachable
- (inst)))),
- // else
- !if (!eq(p.NumSrcArgs, 2),
- // 0 dst, 2 src
- (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
- !if (!eq(p.NumSrcArgs, 1),
- // 0 dst, 1 src
- (inst p.Src0RC32:$src1),
- // else
- // 0 dst, 0 src
- (inst))));
-}
-
-class SIInstAliasSI <string asm, string op_name, VOPProfile p> :
- SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> {
- let AssemblerPredicate = SIAssemblerPredicate;
-}
-
-class SIInstAliasVI <string asm, string op_name, VOPProfile p> :
- SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> {
- let AssemblerPredicates = [isVI];
-}
-
-multiclass SIInstAliasBuilder <string asm, VOPProfile p> {
-
- def : SIInstAliasSI <asm, NAME, p>;
-
- def : SIInstAliasVI <asm, NAME, p>;
-}
-
-class VOP <string opName> {
- string OpName = opName;
-}
-
-class VOP2_REV <string revOp, bit isOrig> {
+class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
}
@@ -1684,832 +1151,6 @@ class AtomicNoRet <string noRetOp, bit isRet> {
bit IsRet = isRet;
}
-class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
- VOP1Common <outs, ins, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-
- field bits<8> vdst;
- field bits<9> src0;
-}
-
-class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> :
- VOP1<op.SI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
- let AssemblerPredicate = SIAssemblerPredicate;
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> :
- VOP1<op.VI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
- string asm = opName#p.Asm32> {
- def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
-
- def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
-
- def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>;
-
-}
-
-class VOP1_DPP <vop1 op, string opName, VOPProfile p> :
- VOP1_DPPe <op.VI>,
- VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
- let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
- let DecoderNamespace = "DPP";
- let DisableDecoder = DisableVIDecoder;
- let src0_modifiers = !if(p.HasModifiers, ?, 0);
- let src1_modifiers = 0;
-}
-
-class SDWADisableFields <VOPProfile p> {
- bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?);
- bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?);
- bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0),
- 0,
- !if(p.HasModifiers, ?, 0));
- bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0),
- 0,
- !if(p.HasModifiers, 0, ?));
- bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6,
- !if(!eq(p.NumSrcArgs, 1), 6,
- ?));
- bits<2> src1_fmodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
- !if(!eq(p.NumSrcArgs, 1), 0,
- !if(p.HasModifiers, ?, 0)));
- bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
- !if(!eq(p.NumSrcArgs, 1), 0,
- !if(p.HasModifiers, 0, ?)));
- bits<3> dst_sel = !if(p.HasDst, ?, 6);
- bits<2> dst_unused = !if(p.HasDst, ?, 2);
- bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?);
-}
-
-class VOP1_SDWA <vop1 op, string opName, VOPProfile p> :
- VOP1_SDWAe <op.VI>,
- VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
- SDWADisableFields <p> {
- let AsmMatchConverter = "cvtSdwaVOP1";
- let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
- let DecoderNamespace = "SDWA";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
- string asm = opName#p.Asm32> {
-
- def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
-
- def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
-}
-
-class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
- VOP2Common <outs, ins, "", pattern>,
- VOP <opName>,
- SIMCInstr<opName#"_e32", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> :
- VOP2 <op.SI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> :
- VOP2 <op.VI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern,
- string revOp> {
-
- def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
- VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
-}
-
-multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern,
- string revOp> {
-
- def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
- VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
-
- def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>;
-
-}
-
-class VOP2_DPP <vop2 op, string opName, VOPProfile p> :
- VOP2_DPPe <op.VI>,
- VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
- let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
- let DecoderNamespace = "DPP";
- let DisableDecoder = DisableVIDecoder;
- let src0_modifiers = !if(p.HasModifiers, ?, 0);
- let src1_modifiers = !if(p.HasModifiers, ?, 0);
-}
-
-class VOP2_SDWA <vop2 op, string opName, VOPProfile p> :
- VOP2_SDWAe <op.VI>,
- VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
- SDWADisableFields <p> {
- let AsmMatchConverter = "cvtSdwaVOP2";
- let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
- let DecoderNamespace = "SDWA";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
-
- bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
- bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0);
- bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0);
- bits<2> omod = !if(HasModifiers, ?, 0);
- bits<1> clamp = !if(HasModifiers, ?, 0);
- bits<9> src1 = !if(HasSrc1, ?, 0);
- bits<9> src2 = !if(HasSrc2, ?, 0);
-}
-
-class VOP3DisableModFields <bit HasSrc0Mods,
- bit HasSrc1Mods = 0,
- bit HasSrc2Mods = 0,
- bit HasOutputMods = 0> {
- bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0);
- bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0);
- bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0);
- bits<2> omod = !if(HasOutputMods, ?, 0);
- bits<1> clamp = !if(HasOutputMods, ?, 0);
-}
-
-class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, "", pattern, HasMods, VOP3Only>,
- VOP <opName>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e64", opName> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-
- field bit vdst;
- field bit src0;
-}
-
-class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3e <op>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3e_vi <op>,
- SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class VOP3_C_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3ce <op>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP3_C_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3ce_vi <op>,
- SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3be <op>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3be_vi <op>,
- SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class VOP3e_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3e <op>,
- SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class VOP3e_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
- bit HasMods = 0, bit VOP3Only = 0> :
- VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
- VOP3e_vi <op>,
- SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
-
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
- !if(!eq(NumSrcArgs, 2), 0, 1),
- HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
- !if(!eq(NumSrcArgs, 2), 0, 1),
- HasMods>;
-}
-
-multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
-
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<0, 0, HasMods>;
-
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<0, 0, HasMods>;
-}
-
-multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
-
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<0, 0, HasMods>;
- // No VI instruction. This class is for SI only.
-}
-
-multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
-
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<1, 0, HasMods>;
-
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<1, 0, HasMods>;
-}
-
-multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
-
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<1, 0, HasMods>;
-
- // No VI instruction. This class is for SI only.
-}
-
-// Two operand VOP3b instruction that may have a 3rd SGPR bool operand
-// instead of an implicit VCC as in the VOP2b format.
-multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
-
- def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<1, useSrc2Input, HasMods>;
-
- def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<1, useSrc2Input, HasMods>;
-}
-
-// Same as VOP3b_2_3_m but no 2nd destination (sdst), e.g. v_cndmask_b32.
-multiclass VOP3e_2_3_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
-
- def _si : VOP3e_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<1, useSrc2Input, HasMods>;
-
- def _vi : VOP3e_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
- VOP3DisableFields<1, useSrc2Input, HasMods>;
-}
-
-multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName,
- bit HasMods, bit defExec,
- string revOp, list<SchedReadWrite> sched> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
- let Defs = !if(defExec, [EXEC], []);
- let SchedRW = sched;
- }
-
- def _si : VOP3_C_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<1, 0, HasMods> {
- let Defs = !if(defExec, [EXEC], []);
- let SchedRW = sched;
- }
-
- def _vi : VOP3_C_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
- VOP3DisableFields<1, 0, HasMods> {
- let Defs = !if(defExec, [EXEC], []);
- let SchedRW = sched;
- }
-}
-
-// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers.
-multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern = []> {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : VOPAnyCommon <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE>;
- }
-
- def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
- SIMCInstr <opName, SIEncodingFamily.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
- }
-
- def _vi : VOP3Common <outs, ins, asm, []>,
- VOP3e_vi <op.VI3>,
- VOP3DisableFields <1, 0, 0>,
- SIMCInstr <opName, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
- }
-}
-
-multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32,
- list<dag> pat64> {
-
- defm _e32 : VOP1_m <op, opName, p, pat32>;
-
- defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
- p.HasModifiers>;
-
- def _dpp : VOP1_DPP <op, opName, p>;
-
- def _sdwa : VOP1_SDWA <op, opName, p>;
-}
-
-multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag> : VOP1_Helper <
- op, opName, P, [],
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0))])
->;
-
-multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag> {
-
- defm _e32 : VOP1SI_m <op, opName, P, []>;
-
- defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]),
- opName, P.HasModifiers>;
-}
-
-multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32,
- list<dag> pat64, string revOp> {
-
- defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
-
- defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
- revOp, p.HasModifiers>;
-
- def _dpp : VOP2_DPP <op, opName, p>;
-
- def _sdwa : VOP2_SDWA <op, opName, p>;
-}
-
-multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName> : VOP2_Helper <
- op, opName, P, [],
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp
->;
-
-multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName> {
-
- defm _e32 : VOP2SI_m <op, opName, P, [], revOp>;
-
- defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- opName, revOp, P.HasModifiers>;
-}
-
-multiclass VOP2e_Helper <vop2 op, string opName, VOPProfile p,
- list<dag> pat32, list<dag> pat64,
- string revOp, bit useSGPRInput> {
-
- let SchedRW = [Write32Bit] in {
- let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
- defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
- }
-
- defm _e64 : VOP3e_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
- opName, revOp, p.HasModifiers, useSGPRInput>;
- }
-}
-
-multiclass VOP2eInst <vop2 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName> : VOP2e_Helper <
- op, opName, P, [],
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, !eq(P.NumSrcArgs, 3)
->;
-
-multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p,
- list<dag> pat32, list<dag> pat64,
- string revOp, bit useSGPRInput> {
-
- let SchedRW = [Write32Bit, WriteSALU] in {
- let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
- defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
- }
-
- defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
- opName, revOp, p.HasModifiers, useSGPRInput>;
- }
-}
-
-multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName> : VOP2b_Helper <
- op, opName, P, [],
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, !eq(P.NumSrcArgs, 3)
->;
-
-// A VOP2 instruction that is VOP3-only on VI.
-multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p,
- list<dag> pat32, list<dag> pat64, string revOp> {
-
- defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>;
-
- defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
- revOp, p.HasModifiers>;
-}
-
-multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName>
- : VOP2_VI3_Helper <
- op, opName, P, [],
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp
->;
-
-multiclass VOP2MADK <vop2 op, string opName, VOPProfile P, list<dag> pattern = []> {
-
- def "" : VOP2_Pseudo <P.Outs, P.Ins32, pattern, opName>;
-
-let isCodeGenOnly = 0 in {
- def _si : VOP2Common <P.Outs, P.Ins32,
- !strconcat(opName, P.Asm32), []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.SI>,
- VOP2_MADKe <op.SI> {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
- }
-
- def _vi : VOP2Common <P.Outs, P.Ins32,
- !strconcat(opName, P.Asm32), []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.VI>,
- VOP2_MADKe <op.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
- }
-} // End isCodeGenOnly = 0
-}
-
-class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> :
- VOPCCommon <ins, "", pattern>,
- VOP <opName>,
- SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class VOPC_SDWA <vopc op, string opName, bit DefExec, VOPProfile p> :
- VOPC_SDWAe <op.VI>,
- VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
- SDWADisableFields <p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let hasSideEffects = DefExec;
- let AsmMatchConverter = "cvtSdwaVOPC";
- let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
- let DecoderNamespace = "SDWA";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
- string opName, bit DefExec, VOPProfile p,
- list<SchedReadWrite> sched,
- string revOpName = "", string asm = opName#"_e32 "#op_asm,
- string alias_asm = opName#" "#op_asm> {
- def "" : VOPC_Pseudo <ins, pattern, opName>,
- VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let SchedRW = sched;
- let isConvergent = DefExec;
- }
-
- let AssemblerPredicates = [isSICI] in {
- def _si : VOPC<op.SI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let isConvergent = DefExec;
- let SchedRW = sched;
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
- }
-
- } // End AssemblerPredicates = [isSICI]
-
- let AssemblerPredicates = [isVI] in {
- def _vi : VOPC<op.VI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let isConvergent = DefExec;
- let SchedRW = sched;
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
- }
-
- } // End AssemblerPredicates = [isVI]
-
- defm : SIInstAliasBuilder<alias_asm, p>;
-}
-
-multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
- list<dag> pat64, bit DefExec, string revOp,
- VOPProfile p, list<SchedReadWrite> sched> {
- defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched,
- revOp>;
-
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
- opName, p.HasModifiers, DefExec, revOp, sched>;
-
- def _sdwa : VOPC_SDWA <op, opName, DefExec, p>;
-}
-
-// Special case for class instructions which only have modifiers on
-// the 1st source operand.
-multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32,
- list<dag> pat64, bit DefExec, string revOp,
- VOPProfile p, list<SchedReadWrite> sched> {
- defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
-
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
- opName, p.HasModifiers, DefExec, revOp, sched>,
- VOP3DisableModFields<1, 0, 0>;
-
- def _sdwa : VOPC_SDWA <op, opName, DefExec, p> {
- let src1_fmodifiers = 0;
- let src1_imodifiers = ?;
- }
-}
-
-multiclass VOPCInst <vopc op, string opName,
- VOPProfile P, PatLeaf cond = COND_NULL,
- string revOp = opName,
- bit DefExec = 0,
- list<SchedReadWrite> sched = [Write32Bit]> :
- VOPC_Helper <
- op, opName, [],
- !if(P.HasModifiers,
- [(set i1:$sdst,
- (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- cond))],
- [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
- DefExec, revOp, P, sched
->;
-
-multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
- bit DefExec = 0,
- list<SchedReadWrite> sched> : VOPC_Class_Helper <
- op, opName, [],
- !if(P.HasModifiers,
- [(set i1:$sdst,
- (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
- [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
- DefExec, opName, P, sched
->;
-
-
-multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>;
-
-multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>;
-
-multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>;
-
-multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>;
-
-
-multiclass VOPCX <vopc op, string opName, VOPProfile P,
- PatLeaf cond = COND_NULL,
- list<SchedReadWrite> sched,
- string revOp = "">
- : VOPCInst <op, opName, P, cond, revOp, 1, sched>;
-
-multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>;
-
-multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>;
-
-multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>;
-
-multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>;
-
-
-multiclass VOPC_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>;
-
-multiclass VOPCX_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>;
-
-multiclass VOPC_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>;
-
-multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>;
-
-
-multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
- list<dag> pat, int NumSrcArgs, bit HasMods,
- bit VOP3Only = 0> : VOP3_m <
- op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only
->;
-
-multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag, bit VOP3Only = 0> :
- VOP3_Helper <
- op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64,
- !if(!eq(P.NumSrcArgs, 3),
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1,
- P.Src2VT:$src2))]),
- !if(!eq(P.NumSrcArgs, 2),
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
- /* P.NumSrcArgs == 1 */,
- !if(P.HasModifiers,
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))),
- P.NumSrcArgs, P.HasModifiers, VOP3Only
->;
-
-// Special case for v_div_fmas_{f32|f64}, since it seems to be the
-// only VOP instruction that implicitly reads VCC.
-multiclass VOP3_VCC_Inst <vop3 op, string opName,
- VOPProfile P,
- SDPatternOperator node = null_frag> : VOP3_Helper <
- op, opName,
- (outs P.DstRC.RegClass:$vdst),
- (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0,
- FPInputMods:$src1_modifiers, P.Src1RC64:$src1,
- FPInputMods:$src2_modifiers, P.Src2RC64:$src2,
- clampmod:$clamp,
- omod:$omod),
- "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
- (i1 VCC)))],
- 3, 1
->;
-
-multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> :
- VOP3b_2_3_m <
- op, P.Outs64, P.Ins64,
- opName#" "#P.Asm64, pattern,
- opName, "", 1, 1, VOP3Only
->;
-
-class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))),
- (Inst i32:$src0_modifiers, P.Src0VT:$src0,
- i32:$src1_modifiers, P.Src1VT:$src1,
- i32:$src2_modifiers, P.Src2VT:$src2,
- i1:$clamp,
- i32:$omod)>;
-
//===----------------------------------------------------------------------===//
// Interpolation opcodes
//===----------------------------------------------------------------------===//
@@ -2551,1052 +1192,6 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
}
//===----------------------------------------------------------------------===//
-// Vector I/O classes
-//===----------------------------------------------------------------------===//
-
-class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- DS <outs, ins, "", pattern>,
- SIMCInstr <opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
- DS <outs, ins, asm, []>,
- DSe <op>,
- SIMCInstr <opName, SIEncodingFamily.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace="SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
- DS <outs, ins, asm, []>,
- DSe_vi <op>,
- SIMCInstr <opName, SIEncodingFamily.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
- DS_Real_si <op,opName, outs, ins, asm> {
-
- // A single load interprets the 2 i8imm operands as a single i16 offset.
- bits<16> offset;
- let offset0 = offset{7-0};
- let offset1 = offset{15-8};
-}
-
-class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
- DS_Real_vi <op, opName, outs, ins, asm> {
-
- // A single load interprets the 2 i8imm operands as a single i16 offset.
- bits<16> offset;
- let offset0 = offset{7-0};
- let offset1 = offset{15-8};
-}
-
-multiclass DS_1A_RET_ <dsop op, string opName, RegisterClass rc,
- dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
- string asm = opName#" $vdst, $addr"#"$offset$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let data0 = 0, data1 = 0 in {
- def _si : DS_Off16_Real_si <op.SI, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op.VI, opName, outs, ins, asm>;
- }
-}
-
-// TODO: DS_1A_RET can be inherited from DS_1A_RET_, but it's not working
-// for some reason. In fact we can remove this class if we use dsop everywhere.
-multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
- dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
- string asm = opName#" $vdst, $addr"#"$offset$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let data0 = 0, data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
- dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1,
- gds:$gds),
- string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in {
- def _si : DS_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
- string asm = opName#" $addr, $data0"#"$offset$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<opName, 0>;
-
- let data1 = 0, vdst = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A_Off8_NORET <bits<8> op, string opName,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr,
- offset0:$offset0, offset1:$offset1, gds:$gds),
- string asm = opName#" $addr $offset0"#"$offset1$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in {
- def _si : DS_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A2D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- offset0:$offset0, offset1:$offset1, gds:$gds),
- string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in {
- def _si : DS_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
- string noRetOp = "",
- dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
- string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
-
- let hasPostISelHook = 1 in {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
-
- let data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
- }
-}
-
-multiclass DS_1A1D_PERMUTE <bits<8> op, string opName, RegisterClass rc,
- SDPatternOperator node = null_frag,
- dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset),
- string asm = opName#" $vdst, $addr, $data0"#"$offset"> {
-
- let mayLoad = 0, mayStore = 0, isConvergent = 1 in {
- def "" : DS_Pseudo <opName, outs, ins,
- [(set i32:$vdst,
- (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))]>;
-
- let data1 = 0, gds = 0 in {
- def "_vi" : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
- }
-}
-
-multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
- string noRetOp = "", dag ins,
- dag outs = (outs rc:$vdst),
- string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> {
-
- let hasPostISelHook = 1 in {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
-
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
- string noRetOp = "", RegisterClass src = rc> :
- DS_1A2D_RET_m <op, asm, rc, noRetOp,
- (ins VGPR_32:$addr, src:$data0, src:$data1,
- offset:$offset, gds:$gds)
->;
-
-multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
- string noRetOp = opName,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- offset:$offset, gds:$gds),
- string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 0>;
-
- let vdst = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass DS_0A_RET <bits<8> op, string opName,
- dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins offset:$offset, gds:$gds),
- string asm = opName#" $vdst"#"$offset"#"$gds"> {
-
- let mayLoad = 1, mayStore = 1 in {
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let addr = 0, data0 = 0, data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- } // end addr = 0, data0 = 0, data1 = 0
- } // end mayLoad = 1, mayStore = 1
-}
-
-multiclass DS_1A_RET_GDS <bits<8> op, string opName,
- dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins VGPR_32:$addr, offset:$offset),
- string asm = opName#" $vdst, $addr"#"$offset gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let data0 = 0, data1 = 0, gds = 1 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- } // end data0 = 0, data1 = 0, gds = 1
-}
-
-multiclass DS_1A_GDS <bits<8> op, string opName,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr),
- string asm = opName#" $addr gds"> {
-
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in {
- def _si : DS_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
- } // end vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1
-}
-
-multiclass DS_1A <bits<8> op, string opName,
- dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
- string asm = opName#" $addr"#"$offset"#"$gds"> {
-
- let mayLoad = 1, mayStore = 1 in {
- def "" : DS_Pseudo <opName, outs, ins, []>;
-
- let vdst = 0, data0 = 0, data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
- } // let vdst = 0, data0 = 0, data1 = 0
- } // end mayLoad = 1, mayStore = 1
-}
-
-//===----------------------------------------------------------------------===//
-// MTBUF classes
-//===----------------------------------------------------------------------===//
-
-class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- MTBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
- string asm> :
- MTBUF <outs, ins, asm, []>,
- MTBUFe <op>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let DecoderNamespace="SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> :
- MTBUF <outs, ins, asm, []>,
- MTBUFe_vi <op>,
- SIMCInstr <opName, SIEncodingFamily.VI> {
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern> {
-
- def "" : MTBUF_Pseudo <opName, outs, ins, pattern>;
-
- def _si : MTBUF_Real_si <op, opName, outs, ins, asm>;
-
- def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>;
-
-}
-
-let mayStore = 1, mayLoad = 0 in {
-
-multiclass MTBUF_Store_Helper <bits<3> op, string opName,
- RegisterClass regClass> : MTBUF_m <
- op, opName, (outs),
- (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
- opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
- #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
->;
-
-} // mayStore = 1, mayLoad = 0
-
-let mayLoad = 1, mayStore = 0 in {
-
-multiclass MTBUF_Load_Helper <bits<3> op, string opName,
- RegisterClass regClass> : MTBUF_m <
- op, opName, (outs regClass:$dst),
- (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
- opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
- #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
->;
-
-} // mayLoad = 1, mayStore = 0
-
-//===----------------------------------------------------------------------===//
-// MUBUF classes
-//===----------------------------------------------------------------------===//
-
-class mubuf <bits<7> si, bits<7> vi = si> {
- field bits<7> SI = si;
- field bits<7> VI = vi;
-}
-
-let isCodeGenOnly = 0 in {
-
-class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- MUBUF <outs, ins, asm, pattern>, MUBUFe <op> {
- let lds = 0;
-}
-
-} // End let isCodeGenOnly = 0
-
-class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> {
- let lds = 0;
-}
-
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
- bit IsAddr64 = is_addr64;
- string OpName = NAME # suffix;
-}
-
-class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- MUBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-
- // dummy fields, so that we can use let statements around multiclasses
- bits<1> offen;
- bits<1> idxen;
- bits<8> vaddr;
- bits<1> glc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-}
-
-class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins,
- string asm> :
- MUBUF <outs, ins, asm, []>,
- MUBUFe <op.SI>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let lds = 0;
- let AssemblerPredicate = SIAssemblerPredicate;
- let DecoderNamespace="SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins,
- string asm> :
- MUBUF <outs, ins, asm, []>,
- MUBUFe_vi <op.VI>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let lds = 0;
- let AssemblerPredicate = VIAssemblerPredicate;
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern> {
-
- def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
- MUBUFAddr64Table <0>;
-
- let DisableWQM = 1 in {
- def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
- }
-
- let addr64 = 0, isCodeGenOnly = 0 in {
- def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
- }
-
- def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
-}
-
-multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs,
- dag ins, string asm, list<dag> pattern> {
-
- def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
- MUBUFAddr64Table <1>;
-
- let addr64 = 1, isCodeGenOnly = 0 in {
- def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
- }
-
- // There is no VI version. If the pseudo is selected, it should be lowered
- // for VI appropriately.
-}
-
-multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern, bit is_return> {
-
- def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
- MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>,
- AtomicNoRet<NAME#"_OFFSET", is_return>;
-
- let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in {
- let addr64 = 0 in {
- def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
- }
-
- def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern, bit is_return> {
-
- def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
- MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>,
- AtomicNoRet<NAME#"_ADDR64", is_return>;
-
- let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in {
- def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
- }
-
- // There is no VI version. If the pseudo is selected, it should be lowered
- // for VI appropriately.
-}
-
-multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins,
- string asm, list<dag> pattern, bit is_return> {
-
- def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
- AtomicNoRet<opName, is_return>;
-
- let tfe = 0 in {
- let addr64 = 0 in {
- def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
- }
-
- def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
- }
-}
-
-multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
- ValueType vt, SDPatternOperator atomic> {
-
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
- DisableWQM = 1 in {
-
- // No return variants
- let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
-
- defm _ADDR64 : MUBUFAtomicAddr64_m <
- op, name#"_addr64", (outs),
- (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$slc", [], 0
- >;
-
- defm _OFFSET : MUBUFAtomicOffset_m <
- op, name#"_offset", (outs),
- (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset,
- slc:$slc),
- name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0
- >;
-
- let offen = 1, idxen = 0 in {
- defm _OFFEN : MUBUFAtomicOther_m <
- op, name#"_offen", (outs),
- (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0
- >;
- }
-
- let offen = 0, idxen = 1 in {
- defm _IDXEN : MUBUFAtomicOther_m <
- op, name#"_idxen", (outs),
- (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0
- >;
- }
-
- let offen = 1, idxen = 1 in {
- defm _BOTHEN : MUBUFAtomicOther_m <
- op, name#"_bothen", (outs),
- (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc",
- [], 0
- >;
- }
- } // glc = 0
-
- // Variants that return values
- let glc = 1, Constraints = "$vdata = $vdata_in",
- AsmMatchConverter = "cvtMubufAtomicReturn",
- DisableEncoding = "$vdata_in" in {
-
- defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
- op, name#"_rtn_addr64", (outs rc:$vdata),
- (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc",
- [(set vt:$vdata,
- (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc), vt:$vdata_in))], 1
- >;
-
- defm _RTN_OFFSET : MUBUFAtomicOffset_m <
- op, name#"_rtn_offset", (outs rc:$vdata),
- (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, off, $srsrc, $soffset$offset glc$slc",
- [(set vt:$vdata,
- (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
- i1:$slc), vt:$vdata_in))], 1
- >;
-
- let offen = 1, idxen = 0 in {
- defm _RTN_OFFEN : MUBUFAtomicOther_m <
- op, name#"_rtn_offen", (outs rc:$vdata),
- (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc",
- [], 1
- >;
- }
-
- let offen = 0, idxen = 1 in {
- defm _RTN_IDXEN : MUBUFAtomicOther_m <
- op, name#"_rtn_idxen", (outs rc:$vdata),
- (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc",
- [], 1
- >;
- }
-
- let offen = 1, idxen = 1 in {
- defm _RTN_BOTHEN : MUBUFAtomicOther_m <
- op, name#"_rtn_bothen", (outs rc:$vdata),
- (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc",
- [], 1
- >;
- }
- } // glc = 1
-
- } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
-}
-
-// FIXME: tfe can't be an operand because it requires a separate
-// opcode, since it needs an N+1 register class dest register.
-multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
- ValueType load_vt = i32,
- SDPatternOperator ld = null_frag> {
-
- let mayLoad = 1, mayStore = 0 in {
- let offen = 0, idxen = 0, vaddr = 0 in {
- defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
- (ins SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
- [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
- i32:$soffset, i16:$offset,
- i1:$glc, i1:$slc, i1:$tfe)))]>;
- }
-
- let offen = 1, idxen = 0 in {
- defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
- (ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc,
- tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>;
- }
-
- let offen = 0, idxen = 1 in {
- defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
- (ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, glc:$glc,
- slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
- }
-
- let offen = 1, idxen = 1 in {
- defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
- (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
- }
-
- let offen = 0, idxen = 0 in {
- defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
- (ins VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset,
- glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$glc$slc$tfe",
- [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
- i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc,
- i1:$tfe)))]>;
- }
- }
-}
-
-multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
- ValueType store_vt = i32, SDPatternOperator st = null_frag> {
- let mayLoad = 0, mayStore = 1 in {
- let offen = 0, idxen = 0, vaddr = 0 in {
- defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
- } // offen = 0, idxen = 0, vaddr = 0
-
- let offen = 1, idxen = 0 in {
- defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, glc:$glc,
- slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#
- "$offset$glc$slc$tfe", []>;
- } // end offen = 1, idxen = 0
-
- let offen = 0, idxen = 1 in {
- defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, offset:$offset, glc:$glc,
- slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
- }
-
- let offen = 1, idxen = 1 in {
- defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs),
- (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
- }
-
- let offen = 0, idxen = 0 in {
- defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
- (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset,
- offset:$offset, glc:$glc, slc:$slc,
- tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#
- "$offset$glc$slc$tfe",
- [(st store_vt:$vdata,
- (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
- i32:$soffset, i16:$offset,
- i1:$glc, i1:$slc, i1:$tfe))]>;
- }
- } // End mayLoad = 0, mayStore = 1
-}
-
-// For cache invalidation instructions.
-multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> {
- let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in {
- def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>;
-
- // Set everything to 0.
- let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0,
- vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in {
- let addr64 = 0 in {
- def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>;
- }
-
- def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>;
- }
- } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = ""
-}
-
-//===----------------------------------------------------------------------===//
-// FLAT classes
-//===----------------------------------------------------------------------===//
-
-class flat <bits<7> ci, bits<7> vi = ci> {
- field bits<7> CI = ci;
- field bits<7> VI = vi;
-}
-
-class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
- FLAT <0, outs, ins, "", pattern>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
- FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicate = isCIOnly;
- let DecoderNamespace="CI";
-}
-
-class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
- FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SIEncodingFamily.VI> {
- let AssemblerPredicate = VIAssemblerPredicate;
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
- list<dag> pattern> {
- def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>,
- AtomicNoRet <NAME, 1>;
-
- def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>;
-
- def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>;
-}
-
-multiclass FLAT_Load_Helper <flat op, string asm_name,
- RegisterClass regClass,
- dag outs = (outs regClass:$vdst),
- dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe),
- string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> {
-
- let data = 0, mayLoad = 1 in {
-
- def "" : FLAT_Pseudo <NAME, outs, ins, []>;
-
- def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
-
- def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
- }
-}
-
-multiclass FLAT_Store_Helper <flat op, string asm_name,
- RegisterClass vdataClass,
- dag outs = (outs),
- dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc,
- slc:$slc, tfe:$tfe),
- string asm = asm_name#" $addr, $data$glc$slc$tfe"> {
-
- let mayLoad = 0, mayStore = 1, vdst = 0 in {
-
- def "" : FLAT_Pseudo <NAME, outs, ins, []>;
-
- def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
-
- def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
- }
-}
-
-multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
- ValueType vt, SDPatternOperator atomic = null_frag,
- ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
-
- let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
- def "" : FLAT_Pseudo <NAME, (outs),
- (ins VReg_64:$addr, data_rc:$data,
- slc:$slc, tfe:$tfe), []>,
- AtomicNoRet <NAME, 0>;
-
- def _ci : FLAT_Real_ci <op.CI, NAME, (outs),
- (ins VReg_64:$addr, data_rc:$data,
- slc:$slc, tfe:$tfe),
- asm_noret>;
-
- def _vi : FLAT_Real_vi <op.VI, NAME, (outs),
- (ins VReg_64:$addr, data_rc:$data,
- slc:$slc, tfe:$tfe),
- asm_noret>;
- }
-
- let glc = 1, hasPostISelHook = 1 in {
- defm _RTN : FLAT_AtomicRet_m <
- op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe),
- asm_name#" $vdst, $addr, $data glc$slc$tfe",
- [(set vt:$vdst,
- (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))]
- >;
- }
-}
-
-class MIMG_Mask <string op, int channels> {
- string Op = op;
- int Channels = channels;
-}
-
-class mimg <bits<7> si, bits<7> vi = si> {
- field bits<7> SI = si;
- field bits<7> VI = vi;
-}
-
-class MIMG_Helper <dag outs, dag ins, string asm,
- string dns=""> : MIMG<outs, ins, asm,[]> {
- let mayLoad = 1;
- let mayStore = 0;
- let hasPostISelHook = 1;
- let DecoderNamespace = dns;
- let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
- let AsmMatchConverter = "cvtMIMG";
-}
-
-class MIMG_NoSampler_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass addr_rc,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
- let ssamp = 0;
-}
-
-multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
-
-multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
-}
-
-class MIMG_Store_Helper <bits<7> op, string asm,
- RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
- (outs),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
- >, MIMGe<op> {
- let ssamp = 0;
- let mayLoad = 1; // TableGen requires this for matching with the intrinsics
- let mayStore = 1;
- let hasSideEffects = 1;
- let hasPostISelHook = 0;
- let DisableWQM = 1;
-}
-
-multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
- RegisterClass data_rc,
- int channels> {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
- MIMG_Mask<asm#"_V4", channels>;
-}
-
-multiclass MIMG_Store <bits<7> op, string asm> {
- defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
- defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
-}
-
-class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc> : MIMG_Helper <
- (outs data_rc:$vdst),
- (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
- > {
- let mayStore = 1;
- let hasSideEffects = 1;
- let hasPostISelHook = 0;
- let DisableWQM = 1;
- let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
-}
-
-class MIMG_Atomic_Real_si<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
- let DisableDecoder = DisableSIDecoder;
-}
-
-class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> :
- MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let isCodeGenOnly = 0;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
- let DisableDecoder = DisableVIDecoder;
-}
-
-multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
- RegisterClass data_rc, RegisterClass addr_rc> {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
- SIMCInstr<name, SIEncodingFamily.NONE>;
- }
-
- let ssamp = 0 in {
- def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
-
- def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
- }
-}
-
-multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
- defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
- defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
- defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
-}
-
-class MIMG_Sampler_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc,
- int wqm,
- string dns=""> : MIMG_Helper <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- dns>, MIMGe<op> {
- let WQM = wqm;
-}
-
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, int wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
- !if(!eq(channels, 1), "AMDGPU", "")>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
-
-multiclass MIMG_Sampler <bits<7> op, string asm, int wqm=0> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
-}
-
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
-
-class MIMG_Gather_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- RegisterClass src_rc, int wqm> : MIMG <
- (outs dst_rc:$vdata),
- (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
- r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
- asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
- []>, MIMGe<op> {
- let mayLoad = 1;
- let mayStore = 0;
-
- // DMASK was repurposed for GATHER4. 4 components are always
- // returned and DMASK works like a swizzle - it selects
- // the component to fetch. The only useful DMASK values are
- // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
- // (red,red,red,red) etc.) The ISA document doesn't mention
- // this.
- // Therefore, disable all code which updates DMASK by setting this:
- let Gather4 = 1;
- let hasPostISelHook = 0;
- let WQM = wqm;
-
- let isAsmParserOnly = 1; // TBD: fix it later
-}
-
-multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
- RegisterClass dst_rc,
- int channels, int wqm> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
- MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
- MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
- MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
- MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
- MIMG_Mask<asm#"_V16", channels>;
-}
-
-multiclass MIMG_Gather <bits<7> op, string asm, int wqm=0> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
-}
-
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
-
-//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
@@ -3604,18 +1199,18 @@ multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
def getVOPe64 : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
- let ColFields = ["Size"];
- let KeyCol = ["4"];
- let ValueCols = [["8"]];
+ let ColFields = ["Size", "VOP3"];
+ let KeyCol = ["4", "0"];
+ let ValueCols = [["8", "1"]];
}
// Maps an opcode in e64 form to its e32 equivalent
def getVOPe32 : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
- let ColFields = ["Size"];
- let KeyCol = ["8"];
- let ValueCols = [["4"]];
+ let ColFields = ["Size", "VOP3"];
+ let KeyCol = ["8", "1"];
+ let ValueCols = [["4", "0"]];
}
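Each InstrMapping def above is expanded by TableGen into a lookup function named after the def, exposed through the llvm::AMDGPU namespace (declared via SIInstrInfo.h in this tree) and returning -1 when no mapping exists. A hedged sketch of how the e32/e64 pair is typically queried; the wrapper names below are illustrative, not from the source:

#include "SIInstrInfo.h"   // declares llvm::AMDGPU::getVOPe32 / getVOPe64

// Map a VOP1/VOP2/VOPC opcode to its VOP3 (64-bit) encoding, if it has one.
static unsigned toVOP3(unsigned Opcode) {
  int NewOpc = llvm::AMDGPU::getVOPe64(Opcode);
  return NewOpc == -1 ? Opcode : unsigned(NewOpc);
}

// Reverse direction: shrink a VOP3 opcode back to its 32-bit encoding.
static unsigned toVOP32(unsigned Opcode) {
  int NewOpc = llvm::AMDGPU::getVOPe32(Opcode);
  return NewOpc == -1 ? Opcode : unsigned(NewOpc);
}

Adding the VOP3 column to the key prevents an e64 instruction from ever mapping onto itself.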
def getMaskedMIMGOp : InstrMapping {
@@ -3628,7 +1223,7 @@ def getMaskedMIMGOp : InstrMapping {
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
- let FilterClass = "VOP2_REV";
+ let FilterClass = "Commutable_REV";
let RowFields = ["RevOp"];
let ColFields = ["IsOrig"];
let KeyCol = ["0"];
@@ -3637,31 +1232,13 @@ def getCommuteOrig : InstrMapping {
// Maps an original opcode to its commuted version
def getCommuteRev : InstrMapping {
- let FilterClass = "VOP2_REV";
- let RowFields = ["RevOp"];
- let ColFields = ["IsOrig"];
- let KeyCol = ["1"];
- let ValueCols = [["0"]];
-}
-
-def getCommuteCmpOrig : InstrMapping {
- let FilterClass = "VOP2_REV";
- let RowFields = ["RevOp"];
- let ColFields = ["IsOrig"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
-}
-
-// Maps an original opcode to its commuted version
-def getCommuteCmpRev : InstrMapping {
- let FilterClass = "VOP2_REV";
+ let FilterClass = "Commutable_REV";
let RowFields = ["RevOp"];
let ColFields = ["IsOrig"];
let KeyCol = ["1"];
let ValueCols = [["0"]];
}
-
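With the filter class widened from VOP2_REV to Commutable_REV, one pair of generated tables now covers every commutable opcode, not just VOP2. A hedged sketch of the usual two-way lookup (the helper name is illustrative):

#include "SIInstrInfo.h"   // declares llvm::AMDGPU::getCommuteRev / getCommuteOrig

// Return the commuted twin of Opcode: original -> reversed if one exists,
// otherwise reversed -> original, otherwise the opcode itself.
static unsigned commutedOpcode(unsigned Opcode) {
  int Rev = llvm::AMDGPU::getCommuteRev(Opcode);
  if (Rev != -1)
    return unsigned(Rev);
  int Orig = llvm::AMDGPU::getCommuteOrig(Opcode);
  return Orig == -1 ? Opcode : unsigned(Orig);
}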
def getMCOpcodeGen : InstrMapping {
let FilterClass = "SIMCInstr";
let RowFields = ["PseudoInstr"];
@@ -3671,6 +1248,15 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.VI)]];
}
+// Get equivalent SOPK instruction.
+def getSOPKOp : InstrMapping {
+ let FilterClass = "SOPKInstTable";
+ let RowFields = ["BaseCmpOp"];
+ let ColFields = ["IsSOPK"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
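The new getSOPKOp table pairs a base scalar compare with its SOPK immediate form. Assuming the generated helper follows the usual InstrMapping pattern (a function named after the def, returning -1 when there is no entry), a minimal sketch of its use:

#include "SIInstrInfo.h"   // assumed home of llvm::AMDGPU::getSOPKOp

// If the compare has an SOPK twin, return it so a 16-bit immediate can be
// folded into the instruction; otherwise keep the original opcode.
static unsigned toSOPK(unsigned Opcode) {
  int SOPK = llvm::AMDGPU::getSOPKOp(Opcode);
  return SOPK == -1 ? Opcode : unsigned(SOPK);
}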
def getAddr64Inst : InstrMapping {
let FilterClass = "MUBUFAddr64Table";
let RowFields = ["OpName"];
@@ -3699,4 +1285,6 @@ def getAtomicNoRetOp : InstrMapping {
include "SIInstructions.td"
include "CIInstructions.td"
-include "VIInstructions.td"
+
+include "DSInstructions.td"
+include "MIMGInstructions.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index dde5f2f..38e31e7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -11,13 +11,6 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
-class InterpSlots {
-int P0 = 2;
-int P10 = 0;
-int P20 = 1;
-}
-def INTERP : InterpSlots;
-
def isGCN : Predicate<"Subtarget->getGeneration() "
">= SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
@@ -25,9 +18,18 @@ def isSI : Predicate<"Subtarget->getGeneration() "
"== SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
-
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+ AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+ AssemblerPredicate<"FeatureMovrel">;
+
+include "VOPInstructions.td"
+include "SOPInstructions.td"
+include "SMInstructions.td"
+include "FLATInstructions.td"
+include "BUFInstructions.td"
let SubtargetPredicate = isGCN in {
@@ -35,1393 +37,8 @@ let SubtargetPredicate = isGCN in {
// EXP Instructions
//===----------------------------------------------------------------------===//
-defm EXP : EXP_m;
-
-//===----------------------------------------------------------------------===//
-// SMRD Instructions
-//===----------------------------------------------------------------------===//
-
-// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SReg_32_XM0 register class does not include M0
-// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32_XM0>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
-
-defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0
->;
-
-defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
- smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64
->;
-
-defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
- smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128
->;
-
-defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
- smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256
->;
-
-defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
- smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
->;
-
-let mayStore = ? in {
-// FIXME: mayStore = ? is a workaround for a TableGen bug: the mayStore flag
-// inferred for the instruction pattern differs from the one inferred for a
-// standalone Pat, and each considers the other contradictory.
-
-defm S_MEMTIME : SMRD_Special <smrd<0x1e, 0x24>, "s_memtime",
- (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))]
->;
-}
-
-defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
- int_amdgcn_s_dcache_inv>;
-
-//===----------------------------------------------------------------------===//
-// SOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-let isMoveImm = 1 in {
- let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
- defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
- } // End isReMaterializable = 1, isAsCheapAsAMove = 1
-
- let Uses = [SCC] in {
- defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
- defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>;
- } // End Uses = [SCC]
-} // End isMoveImm = 1
-
-let Defs = [SCC] in {
- defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
- [(set i32:$sdst, (not i32:$src0))]
- >;
-
- defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
- [(set i64:$sdst, (not i64:$src0))]
- >;
- defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
- defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
-} // End Defs = [SCC]
-
-
-defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
- [(set i32:$sdst, (bitreverse i32:$src0))]
->;
-defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
-
-let Defs = [SCC] in {
- defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
- defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
- defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
- [(set i32:$sdst, (ctpop i32:$src0))]
- >;
- defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
-} // End Defs = [SCC]
-
-defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
-defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
-defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
- [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
->;
-defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
-
-defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
- [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
->;
-
-defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
-defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
- [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))]
->;
-defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
-defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
- [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
->;
-defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
- [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
->;
-
-defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
-defm S_BITSET0_B64 : SOP1_64_32 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
-defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
-defm S_BITSET1_B64 : SOP1_64_32 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
-defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
-defm S_SETPC_B64 : SOP1_1 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
-defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
-defm S_RFE_B64 : SOP1_1 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
-
-let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
-
-defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>;
-defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>;
-defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>;
-defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>;
-defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>;
-defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>;
-defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>;
-defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>;
-
-} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
-
-defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
-defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
-
-let Uses = [M0] in {
-defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
-defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
-defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
-defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
-} // End Uses = [M0]
-
-defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>;
-defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
-let Defs = [SCC] in {
- defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>;
-} // End Defs = [SCC]
-defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>;
-
-//===----------------------------------------------------------------------===//
-// SOP2 Instructions
-//===----------------------------------------------------------------------===//
-
-let Defs = [SCC] in { // Carry out goes to SCC
-let isCommutable = 1 in {
-defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>;
-defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
- [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))]
->;
-} // End isCommutable = 1
-
-defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
-defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
- [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))]
->;
-
-let Uses = [SCC] in { // Carry in comes from SCC
-let isCommutable = 1 in {
-defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
- [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End isCommutable = 1
-
-defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
- [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End Uses = [SCC]
-
-defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
- [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
->;
-defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
- [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
->;
-defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
- [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
->;
-defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
- [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
->;
-} // End Defs = [SCC]
-
-
-let Uses = [SCC] in {
- defm S_CSELECT_B32 : SOP2_32 <sop2<0x0a>, "s_cselect_b32", []>;
- defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>;
-} // End Uses = [SCC]
-
-let Defs = [SCC] in {
-defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32",
- [(set i32:$sdst, (and i32:$src0, i32:$src1))]
->;
-
-defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
- [(set i64:$sdst, (and i64:$src0, i64:$src1))]
->;
-
-defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
- [(set i32:$sdst, (or i32:$src0, i32:$src1))]
->;
-
-defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
- [(set i64:$sdst, (or i64:$src0, i64:$src1))]
->;
-
-defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
- [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
->;
-
-defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
- [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
->;
-defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
-defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
-defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>;
-defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>;
-defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>;
-defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>;
-defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>;
-defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>;
-defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>;
-defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>;
-} // End Defs = [SCC]
-
-// Use added complexity so these patterns are preferred to the VALU patterns.
-let AddedComplexity = 1 in {
-let Defs = [SCC] in {
-
-defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
- [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
->;
-defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
- [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
->;
-defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
- [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
->;
-defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
- [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
->;
-defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
- [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
->;
-defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
- [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
->;
-} // End Defs = [SCC]
-
-defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32",
- [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
-defm S_BFM_B64 : SOP2_64_32_32 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
-defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
- [(set i32:$sdst, (mul i32:$src0, i32:$src1))]
->;
-
-} // End AddedComplexity = 1
-
-let Defs = [SCC] in {
-defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
-defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
-defm S_BFE_U64 : SOP2_64_32 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
-defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
-} // End Defs = [SCC]
-
-let sdst = 0 in {
-defm S_CBRANCH_G_FORK : SOP2_m <
- sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs),
- (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", []
->;
-}
-
-let Defs = [SCC] in {
-defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
-} // End Defs = [SCC]
-
-//===----------------------------------------------------------------------===//
-// SOPC Instructions
-//===----------------------------------------------------------------------===//
-
-def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>;
-def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>;
-def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>;
-def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>;
-def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>;
-def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>;
-def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>;
-def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE >;
-def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>;
-def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>;
-def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
-def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
-def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">;
-def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">;
-def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">;
-def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">;
-def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">;
-
-//===----------------------------------------------------------------------===//
-// SOPK Instructions
-//===----------------------------------------------------------------------===//
-
-let isReMaterializable = 1, isMoveImm = 1 in {
-defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
-} // End isReMaterializable = 1
-let Uses = [SCC] in {
- defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>;
-}
-
-let isCompare = 1 in {
-
-/*
-This instruction is disabled for now until we can figure out how to teach
-the instruction selector to correctly use the S_CMP* vs V_CMP*
-instructions.
-
-When this instruction is enabled the code generator sometimes produces this
-invalid sequence:
-
-SCC = S_CMPK_EQ_I32 SGPR0, imm
-VCC = COPY SCC
-VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
-
-defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32",
- [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
->;
-*/
-
-defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", []>;
-defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>;
-defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>;
-defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>;
-defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>;
-defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>;
-defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>;
-defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>;
-defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>;
-defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>;
-defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>;
-defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>;
-} // End isCompare = 1
-
-let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
- Constraints = "$sdst = $src0" in {
- defm S_ADDK_I32 : SOPK_32TIE <sopk<0x0f, 0x0e>, "s_addk_i32", []>;
- defm S_MULK_I32 : SOPK_32TIE <sopk<0x10, 0x0f>, "s_mulk_i32", []>;
-}
-
-defm S_CBRANCH_I_FORK : SOPK_m <
- sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs),
- (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16"
->;
-
-let mayLoad = 1 in {
-defm S_GETREG_B32 : SOPK_m <
- sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst),
- (ins hwreg:$simm16), " $sdst, $simm16"
->;
-}
-
-defm S_SETREG_B32 : SOPK_m <
- sopk<0x13, 0x12>, "s_setreg_b32", (outs),
- (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst"
->;
-// FIXME: Not on SI?
-//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
-defm S_SETREG_IMM32_B32 : SOPK_IMM32 <
- sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs),
- (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm"
->;
-
-//===----------------------------------------------------------------------===//
-// SOPP Instructions
-//===----------------------------------------------------------------------===//
-
-def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
-
-let isTerminator = 1 in {
-
-def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(AMDGPUendpgm)]> {
- let simm16 = 0;
- let isBarrier = 1;
- let hasCtrlDep = 1;
- let hasSideEffects = 1;
-}
-
-let isBranch = 1 in {
-def S_BRANCH : SOPP <
- 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
- [(br bb:$simm16)]> {
- let isBarrier = 1;
-}
-
-let Uses = [SCC] in {
-def S_CBRANCH_SCC0 : SOPP <
- 0x00000004, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc0 $simm16"
->;
-def S_CBRANCH_SCC1 : SOPP <
- 0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16",
- [(si_uniform_br_scc SCC, bb:$simm16)]
->;
-} // End Uses = [SCC]
-
-let Uses = [VCC] in {
-def S_CBRANCH_VCCZ : SOPP <
- 0x00000006, (ins sopp_brtarget:$simm16),
- "s_cbranch_vccz $simm16"
->;
-def S_CBRANCH_VCCNZ : SOPP <
- 0x00000007, (ins sopp_brtarget:$simm16),
- "s_cbranch_vccnz $simm16"
->;
-} // End Uses = [VCC]
-
-let Uses = [EXEC] in {
-def S_CBRANCH_EXECZ : SOPP <
- 0x00000008, (ins sopp_brtarget:$simm16),
- "s_cbranch_execz $simm16"
->;
-def S_CBRANCH_EXECNZ : SOPP <
- 0x00000009, (ins sopp_brtarget:$simm16),
- "s_cbranch_execnz $simm16"
->;
-} // End Uses = [EXEC]
-
-
-} // End isBranch = 1
-} // End isTerminator = 1
-
-let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
- [(int_amdgcn_s_barrier)]
-> {
- let SchedRW = [WriteBarrier];
- let simm16 = 0;
- let mayLoad = 1;
- let mayStore = 1;
- let isConvergent = 1;
-}
-
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
-def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-
-// On SI the documentation says sleep for approximately 64 * low 2
-// bits, consistent with the reported maximum of 448. On VI the
-// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
-// maximum really 15 on VI?
-def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
- "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
- let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
-}
-
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
-
-let Uses = [EXEC, M0] in {
- // FIXME: Should this be mayLoad+mayStore?
- def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
- [(AMDGPUsendmsg (i32 imm:$simm16))]
- >;
-} // End Uses = [EXEC, M0]
-
-def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
-def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
-def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
- let simm16 = 0;
-}
-def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">;
-def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">;
-def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
- let simm16 = 0;
-}
-} // End hasSideEffects
-
-//===----------------------------------------------------------------------===//
-// VOPC Instructions
-//===----------------------------------------------------------------------===//
-
-let isCompare = 1, isCommutable = 1 in {
-
-defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
-defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
-defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
-defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
-defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
-defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">;
-defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
-defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
-defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
-defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
-defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;
-
-
-defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">;
-defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">;
-defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
-defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
-defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
-defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">;
-defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">;
-defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">;
-defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">;
-defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">;
-defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">;
-defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
-defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
-defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;
-
-
-defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
-defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
-defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
-defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
-defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
-defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">;
-defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
-defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">;
-defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
-defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
-defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;
-
-
-defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
-defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">;
-defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
-defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64", "v_cmpx_ge_f64">;
-defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
-defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
-defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
-defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
-defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
-defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">;
-defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
-defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">;
-defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
-defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
-defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
-defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;
-
-
-let SubtargetPredicate = isSICI in {
-
-defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
-defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
-defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
-defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">;
-defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">;
-defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">;
-defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">;
-defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">;
-defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">;
-defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">;
-defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">;
-defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">;
-defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">;
-defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">;
-defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">;
-defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">;
-
-
-defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">;
-defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">;
-defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">;
-defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">;
-defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">;
-defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">;
-defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">;
-defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">;
-defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">;
-defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">;
-defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">;
-defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">;
-defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">;
-defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">;
-defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">;
-defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">;
-
-
-defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">;
-defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">;
-defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">;
-defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">;
-defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">;
-defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">;
-defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">;
-defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">;
-defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">;
-defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">;
-defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">;
-defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">;
-defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">;
-defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">;
-defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">;
-defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">;
-
-
-defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">;
-defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">;
-defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">;
-defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">;
-defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">;
-defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">;
-defm V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">;
-defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">;
-defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">;
-defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">;
-defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">;
-defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">;
-defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">;
-defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">;
-defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">;
-defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;
-
-} // End SubtargetPredicate = isSICI
-
-defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">;
-defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
-defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">;
-defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
-defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
-defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;
-
-
-defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
-defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">;
-defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
-defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">;
-defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
-defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
-defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
-defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;
-
-
-defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
-defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">;
-defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">;
-defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
-defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
-defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;
-
-
-defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
-defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">;
-defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
-defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">;
-defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
-defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
-defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
-defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;
-
-
-defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
-defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
-defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
-defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
-defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;
-
-
-defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">;
-defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_ge_u32">;
-defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
-defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
-defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
-defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;
-
-
-defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
-defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
-defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
-defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
-defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;
-
-defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">;
-defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">;
-defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
-defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
-defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
-defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;
-
-} // End isCompare = 1, isCommutable = 1
-
-defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">;
-defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">;
-defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">;
-defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
-
-//===----------------------------------------------------------------------===//
-// DS Instructions
-//===----------------------------------------------------------------------===//
-
-defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
-defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
-defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
-defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
-defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
-defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
-defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
-defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
-defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
-defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
-defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
-defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
-defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
-let mayLoad = 0 in {
-defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>;
-defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
-defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>;
-}
-defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
-defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
-defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>;
-defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>;
-
-defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">;
-defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">;
-defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">;
-defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">;
-defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">;
-let mayLoad = 0 in {
-defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>;
-defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>;
-}
-defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
-defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
-defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
-defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
-defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
-defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
-defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
-defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
-defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
-defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
-defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
-defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
-defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
-defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
-defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET <
- 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32
->;
-defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET <
- 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32
->;
-defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
-defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
-defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
-
-let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-defm DS_SWIZZLE_B32 : DS_1A_RET_ <dsop<0x35, 0x3d>, "ds_swizzle_b32", VGPR_32>;
-}
-
-let mayStore = 0 in {
-defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>;
-defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>;
-defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>;
-defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>;
-defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>;
-defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>;
-defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>;
-}
-defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">;
-defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">;
-defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">;
-defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
-defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
-defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
-defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
-defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
-defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
-defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
-defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
-defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
-defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
-defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
-defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
-defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
-let mayLoad = 0 in {
-defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>;
-defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4e, "ds_write2_b64", VReg_64>;
-defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>;
-}
-defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
-defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
-defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
-defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
-
-defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
-defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
-defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
-defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
-defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
-defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
-defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
-defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
-defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
-defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
-defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
-defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
-defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
-defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
-defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>;
-defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>;
-defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
-defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
-defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
-defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
-
-let mayStore = 0 in {
-defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>;
-defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>;
-defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>;
-}
-
-defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">;
-defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">;
-defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">;
-defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">;
-defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">;
-defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">;
-defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">;
-defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">;
-defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">;
-defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src2_b32">;
-defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">;
-defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">;
-defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">;
-
-defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">;
-defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">;
-
-defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">;
-defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">;
-defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">;
-defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">;
-defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">;
-defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">;
-defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">;
-defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">;
-defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">;
-defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">;
-defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">;
-defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">;
-defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">;
-
-defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
-defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
-
-//===----------------------------------------------------------------------===//
-// MUBUF Instructions
-//===----------------------------------------------------------------------===//
-
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper <
- mubuf<0x00>, "buffer_load_format_x", VGPR_32
->;
-defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper <
- mubuf<0x01>, "buffer_load_format_xy", VReg_64
->;
-defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper <
- mubuf<0x02>, "buffer_load_format_xyz", VReg_96
->;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <
- mubuf<0x03>, "buffer_load_format_xyzw", VReg_128
->;
-defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper <
- mubuf<0x04>, "buffer_store_format_x", VGPR_32
->;
-defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper <
- mubuf<0x05>, "buffer_store_format_xy", VReg_64
->;
-defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper <
- mubuf<0x06>, "buffer_store_format_xyz", VReg_96
->;
-defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper <
- mubuf<0x07>, "buffer_store_format_xyzw", VReg_128
->;
-defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
->;
-defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
->;
-defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
->;
-defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
->;
-defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
->;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
->;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
->;
-
-defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
->;
-
-defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
->;
-
-defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
->;
-
-defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store
->;
-
-defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
->;
-
-defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
- mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
->;
-defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic <
- mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
->;
-defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
- mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
->;
-defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
- mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
->;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
- mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
->;
-defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
- mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
->;
-defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
- mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
->;
-defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
- mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
->;
-defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
- mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
->;
-defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
- mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
->;
-defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
- mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
->;
-defm BUFFER_ATOMIC_INC : MUBUF_Atomic <
- mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
->;
-defm BUFFER_ATOMIC_DEC : MUBUF_Atomic <
- mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
->;
-
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
-defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic <
- mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
->;
-defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
- mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
->;
-defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic <
- mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
->;
-defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic <
- mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
->;
-//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic <
- mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
->;
-defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic <
- mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
->;
-defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic <
- mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
->;
-defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic <
- mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
->;
-defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic <
- mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
->;
-defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic <
- mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
->;
-defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic <
- mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
->;
-defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic <
- mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
->;
-defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic <
- mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
->;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
-
-let SubtargetPredicate = isSI, DisableVIDecoder = 1 in {
-defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI
-}
-
-defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>;
-
-//===----------------------------------------------------------------------===//
-// MTBUF Instructions
-//===----------------------------------------------------------------------===//
-
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
-
-//===----------------------------------------------------------------------===//
-// MIMG Instructions
-//===----------------------------------------------------------------------===//
-
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
-defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
-defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
-defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
-defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
-defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
-defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
-defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
-defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
-defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
-defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
-//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
-//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
-defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>;
-}
-
-let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
-} // End isMoveImm = 1
-
-let Uses = [EXEC] in {
-
-// FIXME: Specify SchedRW for READFIRSTLANE_B32
-
-def V_READFIRSTLANE_B32 : VOP1 <
- 0x00000002,
- (outs SReg_32:$vdst),
- (ins VS_32:$src0),
- "v_readfirstlane_b32 $vdst, $src0",
- []
-> {
- let isConvergent = 1;
-}
-
-}
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
- VOP_I32_F64, fp_to_sint
->;
-defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32",
- VOP_F64_I32, sint_to_fp
->;
-defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32",
- VOP_F32_I32, sint_to_fp
->;
-defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32",
- VOP_F32_I32, uint_to_fp
->;
-defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32",
- VOP_I32_F32, fp_to_uint
->;
-defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32",
- VOP_I32_F32, fp_to_sint
->;
-defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32",
- VOP_I32_F32, fp_to_f16
->;
-defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16",
- VOP_F32_I32, f16_to_fp
->;
-defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32",
- VOP_I32_F32, cvt_rpi_i32_f32>;
-defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32",
- VOP_I32_F32, cvt_flr_i32_f32>;
-defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>;
-defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
- VOP_F32_F64, fround
->;
-defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32",
- VOP_F64_F32, fextend
->;
-defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte0
->;
-defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte1
->;
-defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte2
->;
-defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte3
->;
-defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64",
- VOP_I32_F64, fp_to_uint
->;
-defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
- VOP_F64_I32, uint_to_fp
->;
-
-} // End SchedRW = [WriteQuarterRate32]
-
-defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
- VOP_F32_F32, AMDGPUfract
->;
-defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32",
- VOP_F32_F32, ftrunc
->;
-defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32",
- VOP_F32_F32, fceil
->;
-defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32",
- VOP_F32_F32, frint
->;
-defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
- VOP_F32_F32, ffloor
->;
-defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
- VOP_F32_F32, fexp2
->;
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
- VOP_F32_F32, flog2
->;
-defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32",
- VOP_F32_F32, AMDGPUrcp
->;
-defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
- VOP_F32_F32
->;
-defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
- VOP_F32_F32, AMDGPUrsq
->;
-
-} // End SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-
-defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
- VOP_F64_F64, AMDGPUrcp
->;
-defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
- VOP_F64_F64, AMDGPUrsq
->;
-
-} // End SchedRW = [WriteDouble]
-
-defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
- VOP_F32_F32, fsqrt
->;
-
-let SchedRW = [WriteDouble] in {
-
-defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
- VOP_F64_F64, fsqrt
->;
-
-} // End SchedRW = [WriteDouble]
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
- VOP_F32_F32, AMDGPUsin
->;
-defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
- VOP_F32_F32, AMDGPUcos
->;
-
-} // End SchedRW = [WriteQuarterRate32]
-
-defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
-defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c, 0x30>, "v_frexp_exp_i32_f64",
- VOP_I32_F64, int_amdgcn_frexp_exp
->;
-
-let SchedRW = [WriteDoubleAdd] in {
-defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
- VOP_F64_F64, int_amdgcn_frexp_mant
->;
-
-defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64",
- VOP_F64_F64, AMDGPUfract
->;
-} // End SchedRW = [WriteDoubleAdd]
-
-
-defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
- VOP_I32_F32, int_amdgcn_frexp_exp
->;
-defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
- VOP_F32_F32, int_amdgcn_frexp_mant
->;
-let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
-defm V_CLREXCP : VOP1Inst <vop1<0x41, 0x35>, "v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
-}
-
-let Uses = [M0, EXEC] in {
-defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_NO_EXT<VOP_I32_I32>>;
-defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_NO_EXT<VOP_I32_I32>>;
-defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
-} // End Uses = [M0, EXEC]
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>;
-defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32",
- VOP_F32_F32, int_amdgcn_log_clamp>;
-defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
-defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
- VOP_F32_F32, AMDGPUrsq_clamp
->;
-defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
- VOP_F32_F32, AMDGPUrsq_legacy
->;
-
-} // End SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-
-defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
- VOP_F64_F64, AMDGPUrsq_clamp
->;
-
-} // End SchedRW = [WriteDouble]
-
-} // End SubtargetPredicate = isSICI
+defm EXP : EXP_m<0, AMDGPUexport>;
+defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -1433,11 +50,11 @@ let Uses = [M0, EXEC] in {
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
- (outs VGPR_32:$dst),
- (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan),
- (i32 imm:$attr)))]
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+ [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
+ (i32 imm:$attr)))]
>;
let OtherPredicates = [has32BankLDS] in {
@@ -1446,459 +63,33 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS]
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in {
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1
+} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
-let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in {
+let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
- (outs VGPR_32:$dst),
- (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan),
- (i32 imm:$attr)))]>;
+ (outs VGPR_32:$vdst),
+ (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
+ [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
+ (i32 imm:$attr)))]>;
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst"
+} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
- (outs VGPR_32:$dst),
- (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
- (i32 imm:$attr)))]>;
-
-} // End Uses = [M0, EXEC]
-
-//===----------------------------------------------------------------------===//
-// VOP2 Instructions
-//===----------------------------------------------------------------------===//
-
-defm V_CNDMASK_B32 : VOP2eInst <vop2<0x0, 0x0>, "v_cndmask_b32",
- VOP2e_I32_I32_I32_I1
->;
-
-let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
- VOP_F32_F32_F32, fadd
->;
-
-defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
-defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
- VOP_F32_F32_F32, null_frag, "v_sub_f32"
->;
-} // End isCommutable = 1
-
-let isCommutable = 1 in {
-
-defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
- VOP_F32_F32_F32
->;
-
-defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
- VOP_F32_F32_F32, fmul
->;
-
-defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24",
- VOP_I32_I32_I32, AMDGPUmul_i24
->;
-
-defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa, 0x7>, "v_mul_hi_i32_i24",
- VOP_I32_I32_I32
->;
-
-defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24",
- VOP_I32_I32_I32, AMDGPUmul_u24
->;
-
-defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc, 0x9>, "v_mul_hi_u32_u24",
- VOP_I32_I32_I32
->;
-
-defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32,
- fminnum>;
-defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32,
- fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>;
-defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>;
-defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>;
-defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>;
-
-defm V_LSHRREV_B32 : VOP2Inst <
- vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag,
- "v_lshr_b32"
->;
-
-defm V_ASHRREV_I32 : VOP2Inst <
- vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag,
- "v_ashr_i32"
->;
-
-defm V_LSHLREV_B32 : VOP2Inst <
- vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag,
- "v_lshl_b32"
->;
-
-defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
-defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
-defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
-
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
- isConvertibleToThreeAddress = 1 in {
-defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
-}
-} // End isCommutable = 1
-
-defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32", VOP_MADMK>;
-
-let isCommutable = 1 in {
-defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32", VOP_MADAK>;
-} // End isCommutable = 1
-
-let isCommutable = 1 in {
-// No patterns so that the scalar instructions are always selected.
-// The scalar versions will be replaced with vector when needed later.
-
-// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
-// but the VI instructions behave the same as the SI versions.
-defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
- VOP2b_I32_I1_I32_I32
->;
-defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>;
-
-defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
- VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32"
->;
-
-defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
- VOP2b_I32_I1_I32_I32_I1
->;
-defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
- VOP2b_I32_I1_I32_I32_I1
->;
-defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
- VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32"
->;
-
-} // End isCommutable = 1
-
-// These are special and do not read the exec mask.
-let isConvergent = 1, Uses = []<Register> in {
-
-defm V_READLANE_B32 : VOP2SI_3VI_m <
- vop3 <0x001, 0x289>,
- "v_readlane_b32",
- (outs SReg_32:$vdst),
- (ins VS_32:$src0, SCSrc_32:$src1),
- "v_readlane_b32 $vdst, $src0, $src1"
->;
-
-defm V_WRITELANE_B32 : VOP2SI_3VI_m <
- vop3 <0x002, 0x28a>,
- "v_writelane_b32",
(outs VGPR_32:$vdst),
- (ins SReg_32:$src0, SCSrc_32:$src1),
- "v_writelane_b32 $vdst, $src0, $src1"
->;
-
-} // End isConvergent = 1
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-let isCommutable = 1 in {
-defm V_MAC_LEGACY_F32 : VOP2InstSI <vop2<0x6>, "v_mac_legacy_f32",
- VOP_F32_F32_F32
->;
-} // End isCommutable = 1
-
-defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32",
- VOP_F32_F32_F32, AMDGPUfmin_legacy
->;
-defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32",
- VOP_F32_F32_F32, AMDGPUfmax_legacy
->;
-
-let isCommutable = 1 in {
-defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>;
-} // End isCommutable = 1
-} // End SubtargetPredicate = isSICI
-
-defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32",
- VOP_I32_I32_I32
->;
-defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
- VOP_I32_I32_I32
->;
-defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
- VOP_I32_I32_I32, int_amdgcn_mbcnt_lo
->;
-defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
- VOP_I32_I32_I32, int_amdgcn_mbcnt_hi
->;
-defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
- VOP_F32_F32_I32, AMDGPUldexp
->;
-
-defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32",
- VOP_I32_F32_I32>; // TODO: set "Uses = dst"
-
-defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32",
- VOP_I32_F32_F32
->;
-defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32",
- VOP_I32_F32_F32
->;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
- VOP_I32_F32_F32, int_SI_packf16
->;
-defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32",
- VOP_I32_I32_I32
->;
-defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32",
- VOP_I32_I32_I32
->;
-
-//===----------------------------------------------------------------------===//
-// VOP3 Instructions
-//===----------------------------------------------------------------------===//
-
-let isCommutable = 1 in {
-defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32",
- VOP_F32_F32_F32_F32
->;
-
-defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32",
- VOP_F32_F32_F32_F32, fmad
->;
-
-defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24",
- VOP_I32_I32_I32_I32, AMDGPUmad_i24
->;
-defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
- VOP_I32_I32_I32_I32, AMDGPUmad_u24
->;
-} // End isCommutable = 1
-
-defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
- VOP_F32_F32_F32_F32, int_amdgcn_cubeid
->;
-defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
- VOP_F32_F32_F32_F32, int_amdgcn_cubesc
->;
-defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
- VOP_F32_F32_F32_F32, int_amdgcn_cubetc
->;
-defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
- VOP_F32_F32_F32_F32, int_amdgcn_cubema
->;
-
-defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
- VOP_I32_I32_I32_I32, AMDGPUbfe_u32
->;
-defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32",
- VOP_I32_I32_I32_I32, AMDGPUbfe_i32
->;
-
-defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32",
- VOP_I32_I32_I32_I32, AMDGPUbfi
->;
-
-let isCommutable = 1 in {
-defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
- VOP_F32_F32_F32_F32, fma
->;
-defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
- VOP_F64_F64_F64_F64, fma
->;
-
-defm V_LERP_U8 : VOP3Inst <vop3<0x14d, 0x1cd>, "v_lerp_u8",
- VOP_I32_I32_I32_I32, int_amdgcn_lerp
->;
-} // End isCommutable = 1
-
-//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
-defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32",
- VOP_I32_I32_I32_I32
->;
-defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
- VOP_I32_I32_I32_I32
->;
-
-defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32",
- VOP_F32_F32_F32_F32, AMDGPUfmin3>;
-
-defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32",
- VOP_I32_I32_I32_I32, AMDGPUsmin3
->;
-defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32",
- VOP_I32_I32_I32_I32, AMDGPUumin3
->;
-defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32",
- VOP_F32_F32_F32_F32, AMDGPUfmax3
->;
-defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32",
- VOP_I32_I32_I32_I32, AMDGPUsmax3
->;
-defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
- VOP_I32_I32_I32_I32, AMDGPUumax3
->;
-defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
- VOP_F32_F32_F32_F32, AMDGPUfmed3
->;
-defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
- VOP_I32_I32_I32_I32, AMDGPUsmed3
->;
-defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
- VOP_I32_I32_I32_I32, AMDGPUumed3
->;
-
-//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
-//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
-//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
-defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
- VOP_I32_I32_I32_I32
->;
-//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
-defm V_DIV_FIXUP_F32 : VOP3Inst <
- vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
->;
-
-let SchedRW = [WriteDoubleAdd] in {
-
-defm V_DIV_FIXUP_F64 : VOP3Inst <
- vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
->;
-
-} // End SchedRW = [WriteDoubleAdd]
-
-let SchedRW = [WriteDoubleAdd] in {
-let isCommutable = 1 in {
-
-defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
- VOP_F64_F64_F64, fadd, 1
->;
-defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
- VOP_F64_F64_F64, fmul, 1
->;
-
-defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
- VOP_F64_F64_F64, fminnum, 1
->;
-defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
- VOP_F64_F64_F64, fmaxnum, 1
->;
-
-} // End isCommutable = 1
-
-defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
- VOP_F64_F64_I32, AMDGPUldexp, 1
->;
-
-} // End SchedRW = [WriteDoubleAdd]
-
-let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
-
-defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
- VOP_I32_I32_I32
->;
-defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
- VOP_I32_I32_I32, mulhu
->;
-
-let DisableVIDecoder = 1 in { // removed from VI as identical to V_MUL_LO_U32
-defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
- VOP_I32_I32_I32
->;
-}
-
-defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
- VOP_I32_I32_I32, mulhs
->;
-
-} // End isCommutable = 1, SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteFloatFMA, WriteSALU] in {
-defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32",
- VOP3b_F32_I1_F32_F32_F32, [], 1
->;
-}
-
-let SchedRW = [WriteDouble, WriteSALU] in {
-// Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64",
- VOP3b_F64_I1_F64_F64_F64, [], 1
->;
-} // End SchedRW = [WriteDouble, WriteSALU]
-
-let isCommutable = 1, Uses = [VCC, EXEC] in {
-
-let SchedRW = [WriteFloatFMA] in {
-// v_div_fmas_f32:
-// result = src0 * src1 + src2
-// if (vcc)
-// result *= 2^32
-//
-defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
- VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
->;
-}
-
-let SchedRW = [WriteDouble] in {
-// v_div_fmas_f64:
-// result = src0 * src1 + src2
-// if (vcc)
-// result *= 2^64
-//
-defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
- VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
->;
-
-} // End SchedRW = [WriteDouble]
-} // End isCommutable = 1, Uses = [VCC, EXEC]
-
-//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
-//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
-//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
-
-let SchedRW = [WriteDouble] in {
-defm V_TRIG_PREOP_F64 : VOP3Inst <
- vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
->;
-
-} // End SchedRW = [WriteDouble]
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
+ (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+ [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
+ (i32 imm:$attr)))]>;
-defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>;
-defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>;
-defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>;
-
-defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
- VOP_F32_F32_F32_F32>;
-
-} // End SubtargetPredicate = isSICI
-
-let SubtargetPredicate = isVI, DisableSIDecoder = 1 in {
-
-defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
- VOP_I64_I32_I64
->;
-defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64",
- VOP_I64_I32_I64
->;
-defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
- VOP_I64_I32_I64
->;
-
-} // End SubtargetPredicate = isVI
+} // End Uses = [M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -1908,16 +99,16 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
- (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> {
+ (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;
let isCodeGenOnly = 1;
+ let usesCustomInserter = 1;
}
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> {
- let VALU = 1;
-}
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
+ (ins VSrc_b64:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
let usesCustomInserter = 1, SALU = 1 in {
@@ -1925,83 +116,142 @@ def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
+def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+ (ins SSrc_b64:$src0)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+ (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+ let SALU = 1;
+ let isAsCheapAsAMove = 1;
+ let isTerminator = 1;
+}
+
+def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
+ [(int_amdgcn_wave_barrier)]> {
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let isBarrier = 1;
+ let isConvergent = 1;
+}
+
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let hasSideEffects = 1 in {
-
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
def SI_MASK_BRANCH : PseudoInstSI <
- (outs), (ins brtarget:$target, SReg_64:$dst)> {
- let isBranch = 1;
+ (outs), (ins brtarget:$target)> {
+ let isBranch = 0;
let isTerminator = 1;
- let isBarrier = 1;
- let SALU = 1;
+ let isBarrier = 0;
+ let Uses = [EXEC];
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
}
-let Uses = [EXEC], Defs = [EXEC, SCC] in {
-
-let isBranch = 1, isTerminator = 1 in {
+let isTerminator = 1 in {
-def SI_IF: PseudoInstSI <
+def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> {
+ [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
+ let Size = 12;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
}
-def SI_ELSE : PseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target),
- [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
+def SI_ELSE : CFPseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Constraints = "$src = $dst";
+ let Size = 12;
+ let mayStore = 1;
+ let mayLoad = 1;
+ let hasSideEffects = 1;
}
-def SI_LOOP : PseudoInstSI <
+def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
- [(int_amdgcn_loop i64:$saved, bb:$target)]
->;
+ [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+ let Size = 8;
+ let isBranch = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
} // End isTerminator = 1
+def SI_END_CF : CFPseudoInstSI <
+ (outs), (ins SReg_64:$saved),
+ [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+}
-def SI_BREAK : PseudoInstSI <
+def SI_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_break i64:$src))]
->;
+ [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
-def SI_IF_BREAK : PseudoInstSI <
+def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
->;
+ [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
-def SI_ELSE_BREAK : PseudoInstSI <
+def SI_ELSE_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
- [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
->;
-
-def SI_END_CF : PseudoInstSI <
- (outs), (ins SReg_64:$saved),
- [(int_amdgcn_end_cf i64:$saved)]
->;
-
-} // End Uses = [EXEC], Defs = [EXEC, SCC]
+ [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
+ let Size = 4;
+ let isAsCheapAsAMove = 1;
+ let isReMaterializable = 1;
+}
let Uses = [EXEC], Defs = [EXEC,VCC] in {
def SI_KILL : PseudoInstSI <
- (outs), (ins VSrc_32:$src),
- [(int_AMDGPU_kill f32:$src)]> {
+ (outs), (ins VSrc_b32:$src),
+ [(AMDGPUkill i32:$src)]> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
-def SI_KILL_TERMINATOR : PseudoInstSI <
- (outs), (ins VSrc_32:$src)> {
+def SI_KILL_TERMINATOR : SPseudoInstSI <
+ (outs), (ins VSrc_b32:$src)> {
let isTerminator = 1;
}
} // End Uses = [EXEC], Defs = [EXEC,VCC]
-} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
+// Branch on undef scc. Used to avoid intermediate copy from
+// IMPLICIT_DEF to SCC.
+def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
+ let isTerminator = 1;
+ let usesCustomInserter = 1;
+}
def SI_PS_LIVE : PseudoInstSI <
(outs SReg_64:$dst), (ins),
@@ -2013,36 +263,37 @@ def SI_PS_LIVE : PseudoInstSI <
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
-def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> {
+def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let Defs = [M0];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
- let SALU = 1;
let isReMaterializable = 1;
}
-def SI_RETURN : PseudoInstSI <
+def SI_RETURN : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn)]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
let hasSideEffects = 1;
- let SALU = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
-let Uses = [EXEC], Defs = [EXEC, VCC, M0],
+let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
-class SI_INDIRECT_SRC<RegisterClass rc> : PseudoInstSI <
- (outs VGPR_32:$vdst, SReg_64:$sdst),
- (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
+class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
+ (outs VGPR_32:$vdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
+ let usesCustomInserter = 1;
+}
-class SI_INDIRECT_DST<RegisterClass rc> : PseudoInstSI <
- (outs rc:$vdst, SReg_64:$sdst),
- (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
+ (outs rc:$vdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
let Constraints = "$src = $vdst";
+ let usesCustomInserter = 1;
}
// TODO: We can support indirect SGPR access.
@@ -2058,53 +309,60 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
-} // End Uses = [EXEC], Defs = [EXEC,VCC,M0]
+} // End Defs = [M0, EXEC], UseNamedOperandTable = 1
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
- let UseNamedOperandTable = 1, Uses = [EXEC] in {
+ let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx)> {
+ (ins sgpr_class:$data, i32imm:$addr)> {
let mayStore = 1;
let mayLoad = 0;
}
def _RESTORE : PseudoInstSI <
- (outs sgpr_class:$dst),
- (ins i32imm:$frame_idx)> {
+ (outs sgpr_class:$data),
+ (ins i32imm:$addr)> {
let mayStore = 0;
let mayLoad = 1;
}
} // End UseNamedOperandTable = 1
}
-// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use SReg_32_XM0 register class for spills to prevent
-// this from happening.
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>;
+// You cannot use M0 as the output of v_readlane_b32 instructions or
+// use it in the sdata operand of SMEM instructions. We still need to
+// be able to spill the physical register m0, so allow it for
+// SI_SPILL_32_* instructions.
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
- let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
- def _SAVE : PseudoInstSI <
+ let UseNamedOperandTable = 1, VGPRSpill = 1,
+ SchedRW = [WriteVMEM] in {
+ def _SAVE : VPseudoInstSI <
(outs),
- (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset, i32imm:$offset)> {
+ (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
+ // (2 * 4) + (8 * num_subregs) bytes maximum
+ let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
- def _RESTORE : PseudoInstSI <
- (outs vgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
+ def _RESTORE : VPseudoInstSI <
+ (outs vgpr_class:$vdata),
+ (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
+
+ // (2 * 4) + (8 * num_subregs) bytes maximum
+ let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
- } // End UseNamedOperandTable = 1, VGPRSpill = 1
+ } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
@@ -2114,344 +372,26 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
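As a rough illustration (editorial sketch, not part of the patch): vgpr_class.Size is in bits, so !srl(Size, 5) gives the number of 32-bit sub-registers and !shl(..., 3) scales that to 8 bytes of encoded instructions per sub-register, plus the 8-byte fixed overhead noted in the "(2 * 4)" comment. A minimal standalone C++ check of the arithmetic, using the usual VGPR tuple widths:

    // Mirrors: let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
    #include <cstdio>

    unsigned spillSizeBytes(unsigned RegClassSizeInBits) {
      // (Size / 32) sub-registers, 8 bytes each, plus 8 bytes fixed.
      return ((RegClassSizeInBits >> 5) << 3) + 8;
    }

    int main() {
      const unsigned WidthsInBits[] = {32, 64, 96, 128, 256, 512};
      for (unsigned Bits : WidthsInBits)
        std::printf("%3u-bit spill pseudo -> %u bytes max\n",
                    Bits, spillSizeBytes(Bits));
      return 0;
    }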
-let Defs = [SCC] in {
-
-def SI_PC_ADD_REL_OFFSET : PseudoInstSI <
+def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
- (ins si_ga:$ptr),
- [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> {
- let SALU = 1;
+ (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
+ [(set SReg_64:$dst,
+ (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+ let Defs = [SCC];
}
-} // End Defs = [SCC]
-
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
-def : Pat <
- (int_AMDGPU_kilp),
- (SI_KILL 0xbf800000)
->;
-
-/* int_SI_vs_load_input */
-def : Pat<
- (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
- (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
->;
-
-def : Pat <
- (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
- f32:$src0, f32:$src1, f32:$src2, f32:$src3),
- (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
- $src0, $src1, $src2, $src3)
->;
-
-//===----------------------------------------------------------------------===//
-// buffer_load/store_format patterns
-//===----------------------------------------------------------------------===//
-
-multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
- def : Pat<
- (vt (name v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (vt (name v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (vt (name v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (vt (name v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF>(opcode # _BOTHEN)
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-}
-
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
-
-multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
- def : Pat<
- (name vt:$vdata, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (name vt:$vdata, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), 0)
- >;
-
- def : Pat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
- >;
-}
-
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
-
-//===----------------------------------------------------------------------===//
-// buffer_atomic patterns
-//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
- def : Pat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
- (!cast<MUBUF>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
- >;
-
- def : Pat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
- (!cast<MUBUF>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
- >;
-
- def : Pat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
- (!cast<MUBUF>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
- >;
-
- def : Pat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
- (!cast<MUBUF>(opcode # _RTN_BOTHEN)
- $vdata_in,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
- >;
-}
-
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
-
def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
- (EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
- sub0)
+ (int_amdgcn_else i64:$src, bb:$target),
+ (SI_ELSE $src, $target, 0)
>;
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
- (EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
- sub0)
->;
-
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
- (EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
- sub0)
->;
-
-def : Pat<
- (int_amdgcn_buffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
- (EXTRACT_SUBREG
- (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
- sub0)
->;
-
-
-//===----------------------------------------------------------------------===//
-// S_GETREG_B32 Intrinsic Pattern.
-//===----------------------------------------------------------------------===//
def : Pat <
- (int_amdgcn_s_getreg imm:$simm16),
- (S_GETREG_B32 (as_i16imm $simm16))
->;
-
-//===----------------------------------------------------------------------===//
-// DS_SWIZZLE Intrinsic Pattern.
-//===----------------------------------------------------------------------===//
-def : Pat <
- (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
- (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
->;
-
-//===----------------------------------------------------------------------===//
-// SMRD Patterns
-//===----------------------------------------------------------------------===//
-
-multiclass SMRD_Pattern <string Instr, ValueType vt> {
-
- // 1. IMM offset
- def : Pat <
- (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset))
- >;
-
- // 2. SGPR offset
- def : Pat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset))
- >;
-
- def : Pat <
- (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset))
- > {
- let Predicates = [isCIOnly];
- }
-}
-
-// Global and constant loads can be selected to either MUBUF or SMRD
-// instructions, but SMRD instructions are faster so we want the instruction
-// selector to prefer those.
-let AddedComplexity = 100 in {
-
-defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-
-// 1. Offset as an immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
->;
-
-// 2. Offset loaded in a 32-bit SGPR
-def : Pat <
- (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
->;
-
-let Predicates = [isCI] in {
-
-def : Pat <
- (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)
->;
-
-} // End Predicates = [isCI]
-
-} // End let AddedComplexity = 100
-
-//===----------------------------------------------------------------------===//
-// SOP1 Patterns
-//===----------------------------------------------------------------------===//
-
-def : Pat <
- (i64 (ctpop i64:$src)),
- (i64 (REG_SEQUENCE SReg_64,
- (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
- (S_MOV_B32 0), sub1))
->;
-
-def : Pat <
- (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
- (S_ABS_I32 $x)
->;
-
-//===----------------------------------------------------------------------===//
-// SOP2 Patterns
-//===----------------------------------------------------------------------===//
-
-// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
-// case, the sgpr-copies pass will fix this to use the vector version.
-def : Pat <
- (i32 (addc i32:$src0, i32:$src1)),
- (S_ADD_U32 $src0, $src1)
->;
-
-//===----------------------------------------------------------------------===//
-// SOPP Patterns
-//===----------------------------------------------------------------------===//
-
-def : Pat <
- (int_amdgcn_s_waitcnt i32:$simm16),
- (S_WAITCNT (as_i16imm $simm16))
+ (int_AMDGPU_kilp),
+ (SI_KILL (i32 0xbf800000))
>;
//===----------------------------------------------------------------------===//
@@ -2483,308 +423,79 @@ def : Pat <
} // End Predicates = [UnsafeFPMath]
-//===----------------------------------------------------------------------===//
-// VOP2 Patterns
-//===----------------------------------------------------------------------===//
-
def : Pat <
- (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
- (V_BCNT_U32_B32_e64 $popcnt, $val)
+ (f32 (fpextend f16:$src)),
+ (V_CVT_F32_F16_e32 $src)
>;
def : Pat <
- (i32 (select i1:$src0, i32:$src1, i32:$src2)),
- (V_CNDMASK_B32_e64 $src2, $src1, $src0)
+ (f64 (fpextend f16:$src)),
+ (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
-// Pattern for V_MAC_F32
def : Pat <
- (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3NoMods f32:$src1, i32:$src1_modifiers),
- (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
- (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- $src2_modifiers, $src2, $clamp, $omod)
->;
-
-/********** ======================= **********/
-/********** Image sampling patterns **********/
-/********** ======================= **********/
-
-// Image + sampler
-class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode $addr, $rsrc, $sampler,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
- def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
-}
-
-// Image only
-class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
- imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
->;
-
-multiclass ImagePatterns<SDPatternOperator name, string opcode> {
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-class ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc,
- imm:$slc),
- (opcode $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), 0, 0, (as_i1imm $da))
->;
-
-multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
- def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-class ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da,
- imm:$glc, imm:$slc),
- (opcode $data, $addr, $rsrc,
- (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $r128), 0, 0, (as_i1imm $da))
+ (f16 (fpround f32:$src)),
+ (V_CVT_F16_F32_e32 $src)
>;
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
- def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
- def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
- def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
-}
-
-class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
- (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
->;
-
-multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
- def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
-}
-
-class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
- (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
- imm:$r128, imm:$da, imm:$slc),
- (EXTRACT_SUBREG
- (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
- $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
- sub0)
->;
-
-// Basic sample
-defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
-defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-// Only the variants which make sense are defined.
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
-
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
-
-def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
-defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
-defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
-defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
-defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
-defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
-defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
-def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
-
-/* SIsample for simple 1D texture lookup */
def : Pat <
- (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+ (f16 (fpround f64:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
>;
-class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
->;
-
-class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
+def : Pat <
+ (i32 (fp_to_sint f16:$src)),
+ (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
-class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+def : Pat <
+ (i32 (fp_to_uint f16:$src)),
+ (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
-class SampleShadowPattern<SDNode name, MIMG opcode,
- ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+def : Pat <
+ (f16 (sint_to_fp i32:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
>;
-class SampleShadowArrayPattern<SDNode name, MIMG opcode,
- ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+def : Pat <
+ (f16 (uint_to_fp i32:$src)),
+ (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
>;
-/* SIsample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
- MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
-MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
- def : SamplePattern <SIsample, sample, addr_type>;
- def : SampleRectPattern <SIsample, sample, addr_type>;
- def : SampleArrayPattern <SIsample, sample, addr_type>;
- def : SampleShadowPattern <SIsample, sample_c, addr_type>;
- def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
- def : SamplePattern <SIsamplel, sample_l, addr_type>;
- def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
- def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
- def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+multiclass FMADPat <ValueType vt, Instruction inst> {
+ def : Pat <
+ (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3NoMods vt:$src1, i32:$src1_modifiers),
+ (VOP3NoMods vt:$src2, i32:$src2_modifiers))),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ $src2_modifiers, $src2, $clamp, $omod)
+ >;
+}
- def : SamplePattern <SIsampleb, sample_b, addr_type>;
- def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
- def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
- def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+defm : FMADPat <f16, V_MAC_F16_e64>;
+defm : FMADPat <f32, V_MAC_F32_e64>;
- def : SamplePattern <SIsampled, sample_d, addr_type>;
- def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
- def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
- def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
+multiclass SelectPat <ValueType vt, Instruction inst> {
+ def : Pat <
+ (vt (select i1:$src0, vt:$src1, vt:$src2)),
+ (inst $src2, $src1, $src0)
+ >;
}
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
- IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
- IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
- IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
- v2i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
- IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
- IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
- IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
- v4i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
- IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
- IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
- IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
- v8i32>;
-defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
- IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
- IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
- IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
- v16i32>;
+defm : SelectPat <i16, V_CNDMASK_B32_e64>;
+defm : SelectPat <i32, V_CNDMASK_B32_e64>;
+defm : SelectPat <f16, V_CNDMASK_B32_e64>;
+defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+
+def : Pat <
+ (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+ (V_BCNT_U32_B32_e64 $popcnt, $val)
+>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
@@ -2856,6 +567,12 @@ foreach Index = 0-15 in {
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
+// 16-bit bitcast
+def : BitConvert <i16, f16, VGPR_32>;
+def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <i16, f16, SReg_32>;
+def : BitConvert <f16, i16, SReg_32>;
+
// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
@@ -2905,7 +622,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
def : Pat <
(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
(f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
+ (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
>;
/********** ================================ **********/
@@ -2916,7 +633,7 @@ def : Pat <
def : Pat <
(fneg (fabs f32:$src)),
- (S_OR_B32 $src, 0x80000000) // Set sign bit
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;
// FIXME: Should use S_OR_B32
@@ -2925,19 +642,19 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
- (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x80000000)), // Set sign bit.
+ (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
sub1)
>;
def : Pat <
(fabs f32:$src),
- (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
+ (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
>;
def : Pat <
(fneg f32:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
>;
def : Pat <
@@ -2945,8 +662,8 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
- (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
+ (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
sub1)
>;
@@ -2955,33 +672,66 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
- (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x80000000)),
+ (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
+def : Pat <
+ (fneg f16:$src),
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+>;
+
+def : Pat <
+ (fabs f16:$src),
+ (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+>;
+
+def : Pat <
+ (fneg (fabs f16:$src)),
+ (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
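The constants in the fneg/fabs patterns above are plain sign-bit arithmetic on the IEEE bit patterns (0x80000000/0x7fffffff for f32, 0x8000/0x7fff for f16). A minimal C++ sketch of the f32 case (illustration only, not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // fneg: flip the sign bit, as V_XOR_B32 with 0x80000000 does.
    float fnegViaXor(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits ^= 0x80000000u;
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    // fabs: clear the sign bit, as V_AND_B32 with 0x7fffffff does.
    float fabsViaAnd(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= 0x7fffffffu;
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    int main() {
      assert(fnegViaXor(1.5f) == -1.5f);
      assert(fabsViaAnd(-2.25f) == 2.25f);
      return 0;
    }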
+
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
def : Pat <
- (SGPRImm<(i32 imm)>:$imm),
- (S_MOV_B32 imm:$imm)
+ (VGPRImm<(i32 imm)>:$imm),
+ (V_MOV_B32_e32 imm:$imm)
>;
def : Pat <
- (SGPRImm<(f32 fpimm)>:$imm),
- (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+ (VGPRImm<(f32 fpimm)>:$imm),
+ (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(i32 imm:$imm),
- (V_MOV_B32_e32 imm:$imm)
+ (S_MOV_B32 imm:$imm)
+>;
+
+// FIXME: Workaround for ordering issue with peephole optimizer where
+// a register class copy interferes with immediate folding. Should
+// use s_mov_b32, which can be shrunk to s_movk_i32
+def : Pat <
+ (VGPRImm<(f16 fpimm)>:$imm),
+ (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(f32 fpimm:$imm),
- (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+ (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (f16 fpimm:$imm),
+ (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
>;
def : Pat <
@@ -3011,21 +761,21 @@ def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
- (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
- 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
- 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+ (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub0,
- (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
- 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
- 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
+ (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub1,
- (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
- 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
- 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
+ (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub2,
- (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
- 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
- 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
+ (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+ 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub3)
>;
@@ -3042,17 +792,11 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// Offset in a 32-bit VGPR
-def : Pat <
- (SIload_constant v4i32:$sbase, i32:$voff),
- (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
->;
-
// The multiplication scales from [0,1] to the unsigned integer range
def : Pat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
- (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
+ (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
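A scalar sketch of what the AMDGPUurecip lowering above computes (illustration only; 1.0f/x stands in for the hardware V_RCP_IFLAG_F32 approximation): convert the operand to float, take the reciprocal, scale by 2^32 (CONST.FP_UINT_MAX_PLUS_1), and convert back to an unsigned integer.

    #include <cstdint>
    #include <cstdio>

    uint32_t urecip(uint32_t x) {
      float r = 1.0f / static_cast<float>(x);   // V_RCP_IFLAG_F32 stand-in
      float scaled = 4294967296.0f * r;         // V_MUL_F32 by 2^32
      return static_cast<uint32_t>(scaled);     // V_CVT_U32_F32
    }

    int main() {
      std::printf("urecip(3)  = 0x%08x\n", static_cast<unsigned>(urecip(3)));
      std::printf("urecip(10) = 0x%08x\n", static_cast<unsigned>(urecip(10)));
      return 0;
    }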
@@ -3066,245 +810,8 @@ def : UMad24Pat<V_MAD_U32_U24>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
-/********** ======================= **********/
-/********** Load/Store Patterns **********/
-/********** ======================= **********/
-
-class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
- (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 0))
->;
-
-def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
-def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
-def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
-def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
-def : DSReadPat <DS_READ_B32, i32, si_load_local>;
-
-let AddedComplexity = 100 in {
-
-def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
-
-} // End AddedComplexity = 100
-
-def : Pat <
- (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1))),
- (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
->;
-
-class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
- (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
->;
-
-def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
-def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
-def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
-
-let AddedComplexity = 100 in {
-
-def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
-} // End AddedComplexity = 100
-
-def : Pat <
- (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1)),
- (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
- (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
- (i1 0))
->;
-
-class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
->;
-
-class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
->;
-
-
-// 32-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
-def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
-
-// 64-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
-def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
-
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
-
-
-//===----------------------------------------------------------------------===//
-// MUBUF Patterns
-//===----------------------------------------------------------------------===//
-
-class MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
- PatFrag constant_ld> : Pat <
- (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
- >;
-
-multiclass MUBUFLoad_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
- ValueType vt, PatFrag atomic_ld> {
- def : Pat <
- (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
- >;
-
- def : Pat <
- (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
- >;
-}
-
-let Predicates = [isSICI] in {
-def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
-
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
-} // End Predicates = [isSICI]
-
-class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
- (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
-
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
-
-// BUFFER_LOAD_DWORD*, addr64=0
-multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
- MUBUF bothen> {
-
- def : Pat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
- imm:$offset, 0, 0, imm:$glc, imm:$slc,
- imm:$tfe)),
- (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), (as_i1imm $tfe))
- >;
-
- def : Pat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
- imm:$offset, 1, 0, imm:$glc, imm:$slc,
- imm:$tfe)),
- (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $tfe))
- >;
-
- def : Pat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
- imm:$offset, 0, 1, imm:$glc, imm:$slc,
- imm:$tfe)),
- (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), (as_i1imm $tfe))
- >;
-
- def : Pat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
- imm:$offset, 1, 1, imm:$glc, imm:$slc,
- imm:$tfe)),
- (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $tfe))
- >;
-}
-
-defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
- BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
-defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
- BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
-defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
- BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
-
-multiclass MUBUFStore_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
- ValueType vt, PatFrag atomic_st> {
- // Store follows atomic op convention so address is first
- def : Pat <
- (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
- >;
-
- def : Pat <
- (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
- >;
-}
-let Predicates = [isSICI] in {
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
-defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
-} // End Predicates = [isSICI]
-
-class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
- u16imm:$offset)),
- (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
-
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
-
-//===----------------------------------------------------------------------===//
-// MTBUF Patterns
-//===----------------------------------------------------------------------===//
-
-// TBUFFER_STORE_FORMAT_*, addr64=0
-class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
- (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
- i32:$soffset, imm:$inst_offset, imm:$dfmt,
- imm:$nfmt, imm:$offen, imm:$idxen,
- imm:$glc, imm:$slc, imm:$tfe),
- (opcode
- $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
- (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
- (as_i1imm $slc), (as_i1imm $tfe), $soffset)
->;
-
-def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
-def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
-def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
-def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
-
/********** ====================== **********/
-/********** Indirect adressing **********/
+/********** Indirect addressing **********/
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
@@ -3332,48 +839,80 @@ defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
//===----------------------------------------------------------------------===//
+// SAD Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (add (sub_oneuse (umax i32:$src0, i32:$src1),
+ (umin i32:$src0, i32:$src1)),
+ i32:$src2),
+ (V_SAD_U32 $src0, $src1, $src2)
+>;
+
+def : Pat <
+ (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
+ (sub i32:$src0, i32:$src1),
+ (sub i32:$src1, i32:$src0)),
+ i32:$src2),
+ (V_SAD_U32 $src0, $src1, $src2)
+>;
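
Both SAD patterns describe the same scalar computation, the unsigned absolute difference of the first two operands plus the third, which V_SAD_U32 performs in one instruction. A minimal reference sketch (illustrative C++ only, not part of the patch; sadU32 and sadU32Select are made-up names):

#include <algorithm>
#include <cstdint>

// First pattern form: (umax - umin) + accumulator.
constexpr uint32_t sadU32(uint32_t A, uint32_t B, uint32_t C) {
  return (std::max(A, B) - std::min(A, B)) + C;
}

// Second pattern form: select(a > b, a - b, b - a) + accumulator.
constexpr uint32_t sadU32Select(uint32_t A, uint32_t B, uint32_t C) {
  return (A > B ? A - B : B - A) + C;
}

static_assert(sadU32(7, 19, 100) == 112 && sadU32Select(7, 19, 100) == 112,
              "both forms compute |a - b| + c");
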
+
+//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
def : Pat<(i32 (sext_inreg i32:$src, i1)),
- (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+ (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : Pat <
(i64 (sext_inreg i64:$src, i1)),
- (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
+ (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
+>;
+
+def : Pat <
+ (i16 (sext_inreg i16:$src, i1)),
+ (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
+>;
+
+def : Pat <
+ (i16 (sext_inreg i16:$src, i8)),
+ (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i8)),
- (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
+ (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i16)),
- (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+ (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i32)),
- (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
+ (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
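
The immediates in these patterns pack the bitfield description as the offset in the low bits with the width shifted into the upper half, as the "0 | N << 16" comments indicate. A small sketch of that encoding (encodeBFE is an illustrative helper, not defined anywhere in the patch):

#include <cstdint>

constexpr uint32_t encodeBFE(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);
}

static_assert(encodeBFE(0, 1)  == 0x10000,  "sext_inreg ..., i1");
static_assert(encodeBFE(0, 8)  == 0x80000,  "sext_inreg ..., i8");
static_assert(encodeBFE(0, 16) == 0x100000, "sext_inreg ..., i16");
static_assert(encodeBFE(0, 32) == 0x200000, "sext_inreg ..., i32");
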
-class ZExt_i64_i32_Pat <SDNode ext> : Pat <
- (i64 (ext i32:$src)),
- (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
+def : Pat <
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+ (i64 (anyext i32:$src)),
+ (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
class ZExt_i64_i1_Pat <SDNode ext> : Pat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
- (S_MOV_B32 0), sub1)
+ (S_MOV_B32 (i32 0)), sub1)
>;
-def : ZExt_i64_i32_Pat<zext>;
-def : ZExt_i64_i32_Pat<anyext>;
def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;
@@ -3382,29 +921,29 @@ def : ZExt_i64_i1_Pat<anyext>;
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
- (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
def : Pat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 0, -1, $src), sub0,
- (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
>;
-class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
- (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+ (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
>;
-def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>;
-def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>;
-def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>;
-def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
-// comparisions still write to a pair of SGPRs, so treat these as
+// comparisons still write to a pair of SGPRs, so treat these as
// 64-bit comparisons. When legalizing SGPR copies, instructions
// resulting in the copies from SCC to these instructions will be
// moved to the VALU.
@@ -3425,12 +964,12 @@ def : Pat <
def : Pat <
(f32 (sint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+ (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
def : Pat <
(f32 (uint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+ (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
>;
def : Pat <
@@ -3454,25 +993,25 @@ def : Pat <
def : Pat <
(i1 (trunc i32:$a)),
- (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1)
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
def : Pat <
- (i1 (trunc i64:$a)),
- (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1),
- (EXTRACT_SUBREG $a, sub0)), 1)
+ (i1 (trunc i16:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
def : Pat <
- (i32 (bswap i32:$a)),
- (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
- (V_ALIGNBIT_B32 $a, $a, 24),
- (V_ALIGNBIT_B32 $a, $a, 8))
+ (i1 (trunc i64:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
def : Pat <
- (f32 (select i1:$src2, f32:$src1, f32:$src0)),
- (V_CNDMASK_B32_e64 $src0, $src1, $src2)
+ (i32 (bswap i32:$a)),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 $a, $a, (i32 24)),
+ (V_ALIGNBIT_B32 $a, $a, (i32 8)))
>;
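
The bswap pattern works because V_ALIGNBIT_B32 $a, $a, s acts as a 32-bit rotate right by s and V_BFI_B32 mask, x, y computes (x & mask) | (y & ~mask); the mask 0x00ff00ff then selects the correct bytes from the two rotates. A reference sketch under those assumptions (illustrative names only):

#include <cstdint>

constexpr uint32_t rotr32(uint32_t V, unsigned S) {
  return (V >> S) | (V << (32 - S));
}

constexpr uint32_t bswapViaBFI(uint32_t A) {
  return (rotr32(A, 24) & 0x00ff00ffu) | (rotr32(A, 8) & ~0x00ff00ffu);
}

static_assert(bswapViaBFI(0x11223344) == 0x44332211,
              "BFI of the two rotates byte-swaps the word");
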
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
@@ -3483,7 +1022,7 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : Pat <
(vt (add (vt (shl 1, vt:$a)), -1)),
- (BFM $a, (MOV 0))
+ (BFM $a, (MOV (i32 0)))
>;
}
@@ -3492,16 +1031,14 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
def : BFEPattern <V_BFE_U32, S_MOV_B32>;
-let Predicates = [isSICI] in {
-def : Pat <
- (i64 (readcyclecounter)),
- (S_MEMTIME)
+def : Pat<
+ (fcanonicalize f16:$src),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
>;
-}
def : Pat<
(fcanonicalize f32:$src),
- (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
>;
def : Pat<
@@ -3536,7 +1073,7 @@ def : Pat <
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
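
The (i32 3) operand of V_CMP_CLASS_F64 is a floating-point class mask; assuming the usual AMDGPU class encoding (bit 0 for signaling NaN, bit 1 for quiet NaN), a mask of 3 selects "any NaN", matching the /*NaN*/ comment. A tiny sketch of that assumption:

enum : unsigned {
  SIGNALING_NAN = 1u << 0,
  QUIET_NAN     = 1u << 1
  // higher bits select the infinity, normal, denormal and zero classes
};
static_assert((SIGNALING_NAN | QUIET_NAN) == 3,
              "matches the immediate used in the pattern above");
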
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
index 9d06ccf..5da3754 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
@@ -15,7 +15,20 @@
let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+
+ def int_SI_export : Intrinsic <[],
+ [llvm_i32_ty, // en
+ llvm_i32_ty, // vm (FIXME: should be i1)
+ llvm_i32_ty, // done (FIXME: should be i1)
+ llvm_i32_ty, // tgt
+ llvm_i32_ty, // compr (FIXME: should be i1)
+ llvm_float_ty, // src0
+ llvm_float_ty, // src1
+ llvm_float_ty, // src2
+ llvm_float_ty], // src3
+ []
+ >;
+
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
@@ -186,11 +199,11 @@ let TargetPrefix = "amdgcn", isTarget = 1 in {
/* Control flow Intrinsics */
- def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
- def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
- def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
- def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
- def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+ def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+ def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9e972a5..99fe96c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- LiveIntervals *LIS;
+ AliasAnalysis *AA;
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
- MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize);
+ MachineBasicBlock::iterator findMatchingDSInst(
+ MachineBasicBlock::iterator I,
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove);
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
MachineBasicBlock::iterator mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
public:
static char ID;
SILoadStoreOptimizer()
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- LIS(nullptr) {}
+ AA(nullptr) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -94,16 +98,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Load / Store Optimizer";
- }
+ StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<LiveVariables>();
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -113,9 +112,7 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@@ -127,6 +124,73 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
return new SILoadStoreOptimizer(TM);
}
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+ ArrayRef<MachineInstr*> InstsToMove) {
+ MachineBasicBlock *MBB = I->getParent();
+ ++I;
+ for (MachineInstr *MI : InstsToMove) {
+ MI->removeFromParent();
+ MBB->insert(I, MI);
+ }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs) {
+ for (const MachineOperand &Def : MI.defs()) {
+ Defs.push_back(&Def);
+ }
+}
+
+static bool memAccessesCanBeReordered(
+ MachineBasicBlock::iterator A,
+ MachineBasicBlock::iterator B,
+ const SIInstrInfo *TII,
+ llvm::AliasAnalysis * AA) {
+ return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+ // RAW or WAR - cannot reorder
+ // WAW - cannot reorder
+ // RAR - safe to reorder
+ !(A->mayStore() || B->mayStore()));
+}
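
As a sketch of the dependence reasoning in the comment above: when the two accesses may alias, only a pair of loads (read-after-read) can be swapped, while every case involving a store must keep its order (canReorderIfAliasing is an illustrative helper, not part of the patch):

//   load/load   -> RAR, safe to reorder
//   load/store  -> WAR, keep order
//   store/load  -> RAW, keep order
//   store/store -> WAW, keep order
constexpr bool canReorderIfAliasing(bool AStores, bool BStores) {
  return !(AStores || BStores); // only read-after-read may be swapped
}
static_assert(canReorderIfAliasing(false, false), "RAR is reorderable");
static_assert(!canReorderIfAliasing(true, false), "RAW/WAR/WAW are not");
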
+
+// Add MI and its defs to the lists if MI reads one of the defs that are
+// already in the list. Returns true in that case.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs,
+ SmallVectorImpl<MachineInstr*> &Insts) {
+ for (const MachineOperand *Def : Defs) {
+ bool ReadDef = MI.readsVirtualRegister(Def->getReg());
+ // If ReadDef is true, then there is a use of Def between I
+ // and the instruction that I will potentially be merged with. We
+ // will need to move this instruction after the merged instructions.
+ if (ReadDef) {
+ Insts.push_back(&MI);
+ addDefsToList(MI, Defs);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr*> InstsToMove,
+ const SIInstrInfo *TII,
+ AliasAnalysis *AA) {
+
+ assert(MemOp.mayLoadOrStore());
+
+ for (MachineInstr *InstToMove : InstsToMove) {
+ if (!InstToMove->mayLoadOrStore())
+ continue;
+ if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+ return false;
+ }
+ return true;
+}
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned Size) {
@@ -156,43 +220,99 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize){
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
++MBBI;
- if (MBBI->getOpcode() != I->getOpcode())
- return E;
-
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return E;
-
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
- AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
- // Check both offsets fit in the reduced range.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
- return MBBI;
- }
+ SmallVector<const MachineOperand *, 8> DefsToMove;
+ addDefsToList(*I, DefsToMove);
+ for ( ; MBBI != E; ++MBBI) {
+
+ if (MBBI->getOpcode() != I->getOpcode()) {
+
+ // This is not a matching DS instruction, but we can keep looking as
+ // long as one of these conditions is met:
+ // 1. It is safe to move I down past MBBI.
+ // 2. It is safe to move MBBI down past the instruction that I will
+ // be merged into.
+
+ if (MBBI->hasUnmodeledSideEffects())
+ // We can't re-order this instruction with respect to other memory
+ // operations, so we fail both conditions mentioned above.
+ return E;
+
+ if (MBBI->mayLoadOrStore() &&
+ !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+ // We fail condition #1, but we may still be able to satisfy condition
+ // #2. Add this instruction to the move list and then we will check
+ // if condition #2 holds once we have selected the matching instruction.
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ continue;
+ }
+
+ // When we match I with another DS instruction we will be moving I down
+ // to the location of the matched instruction, and any uses of I will need to
+ // be moved down as well.
+ addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+ continue;
+ }
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ // Handle a case like
+ // DS_WRITE_B32 addr, v, idx0
+ // w = DS_READ_B32 addr, idx0
+ // DS_WRITE_B32 addr, f(w), idx1
+ // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+ // merging of the two writes.
+ if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+ continue;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ // We also need to go through the list of instructions that we plan to
+ // move and make sure they are all safe to move down past the merged
+ // instruction.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+ canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+ return MBBI;
+ }
+
+ // We've found a load/store that we couldn't merge for some reason.
+ // We could potentially keep looking, but we'd need to make sure that
+ // it was safe to move I and also all the instructions in InstsToMove
+ // down past this instruction.
+ if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+ )
+ break;
+ }
return E;
}
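
The scan only returns a candidate once offsetsCanBeCombined accepts the two offsets. Its body lies outside this hunk, so the sketch below is an assumption based on the isUInt<8> and "offsets differ" assertions later in this file: DS_READ2/WRITE2 encode two distinct 8-bit offsets in units of the element size.

// Rough sketch only; the real offsetsCanBeCombined also handles the ST64
// variants, which are omitted here.
static bool offsetsCanBeCombinedSketch(unsigned Offset0, unsigned Offset1,
                                       unsigned EltSize) {
  if (Offset0 == Offset1)
    return false;                                   // two distinct slots
  if (Offset0 % EltSize != 0 || Offset1 % EltSize != 0)
    return false;                                   // offsets scale by EltSize
  return Offset0 / EltSize <= 255 &&                // each offset field
         Offset1 / EltSize <= 255;                  // is 8 bits wide
}
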
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -220,6 +340,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
}
+ unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+
+ if (NewOffset0 > NewOffset1) {
+ // Canonicalize the merged instruction so the smaller offset comes first.
+ std::swap(NewOffset0, NewOffset1);
+ std::swap(SubRegIdx0, SubRegIdx1);
+ }
+
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
(NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
@@ -232,62 +361,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
DebugLoc DL = I->getDebugLoc();
MachineInstrBuilder Read2
- = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+ = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
-
- unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
- unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+ (void)Read2;
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(*Read2);
-
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
- // The new write to the original destination register is now the copy. Steal
- // the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
- LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+ moveInstsAfter(Copy1, InstsToMove);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
- LIS->shrinkToUses(&AddrRegLI);
-
- LIS->createAndComputeVirtRegInterval(DestReg);
-
- if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
- M0Segment->end = Read2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Read2.getInstr();
+ return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -316,6 +423,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
}
+ if (NewOffset0 > NewOffset1) {
+ // Canonicalize the merged instruction so the smaller offset comes first.
+ std::swap(NewOffset0, NewOffset1);
+ std::swap(Data0, Data1);
+ }
+
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
(NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
@@ -323,15 +436,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
MachineInstrBuilder Write2
- = BuildMI(*MBB, I, DL, Write2Desc)
+ = BuildMI(*MBB, Paired, DL, Write2Desc)
.addOperand(*Addr) // addr
.addOperand(*Data0) // data0
.addOperand(*Data1) // data1
@@ -341,24 +447,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
- // XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+ moveInstsAfter(Write2, InstsToMove);
- LIS->RemoveMachineInstrFromMaps(*I);
- LIS->RemoveMachineInstrFromMaps(*Paired);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- // This doesn't handle physical registers like M0
- LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
- if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
- M0Segment->end = Write2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Write2.getInstr();
+ return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
@@ -376,13 +472,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
+ SmallVector<MachineInstr*, 8> InstsToMove;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size);
+ I = mergeRead2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -390,10 +488,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size);
+ I = mergeWrite2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -419,13 +518,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
-
- LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
- assert(!MRI->isSSA());
-
bool Modified = false;
for (MachineBasicBlock &MBB : MF)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ee1d5da..7ed18f2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,7 +58,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
using namespace llvm;
@@ -68,63 +67,50 @@ namespace {
class SILowerControlFlow : public MachineFunctionPass {
private:
- static const unsigned SkipThreshold = 12;
-
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
+ LiveIntervals *LIS;
+ MachineRegisterInfo *MRI;
- bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
-
- void Skip(MachineInstr &From, MachineOperand &To);
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
- void If(MachineInstr &MI);
- void Else(MachineInstr &MI, bool ExecModified);
- void Break(MachineInstr &MI);
- void IfBreak(MachineInstr &MI);
- void ElseBreak(MachineInstr &MI);
- void Loop(MachineInstr &MI);
- void EndCf(MachineInstr &MI);
-
- void Kill(MachineInstr &MI);
- void Branch(MachineInstr &MI);
-
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- std::pair<MachineBasicBlock *, MachineBasicBlock *>
- splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+ void emitIf(MachineInstr &MI);
+ void emitElse(MachineInstr &MI);
+ void emitBreak(MachineInstr &MI);
+ void emitIfBreak(MachineInstr &MI);
+ void emitElseBreak(MachineInstr &MI);
+ void emitLoop(MachineInstr &MI);
+ void emitEndCf(MachineInstr &MI);
- void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
- const MachineRegisterInfo &MRI,
- const MachineInstr &MI,
- MachineBasicBlock &LoopBB,
- MachineBasicBlock &RemainderBB,
- unsigned SaveReg,
- const MachineOperand &IdxReg);
+ void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const;
- void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
- MachineInstr *MovRel,
- const MachineOperand &IdxReg,
- int Offset);
-
- bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
- int Offset) const;
- bool indirectSrc(MachineInstr &MI);
- bool indirectDst(MachineInstr &MI);
+ void combineMasks(MachineInstr &MI);
public:
static char ID;
SILowerControlFlow() :
- MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+ MachineFunctionPass(ID),
+ TRI(nullptr),
+ TII(nullptr),
+ LIS(nullptr),
+ MRI(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "SI Lower control flow pseudo instructions";
}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -132,555 +118,283 @@ public:
char SILowerControlFlow::ID = 0;
INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
- "SI lower control flow", false, false)
+ "SI lower control flow", false, false)
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+ MachineOperand &ImpDefSCC = MI.getOperand(3);
+ assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
- return new SILowerControlFlow();
+ ImpDefSCC.setIsDead(IsDead);
}
-static bool opcodeEmitsNoInsts(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::BUNDLE:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
- return true;
- default:
- return false;
- }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
- if (From->succ_empty())
- return false;
-
- unsigned NumInstr = 0;
- MachineFunction *MF = From->getParent();
-
- for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(I->getOpcode()))
- continue;
-
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (I->isInlineAsm()) {
- const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const char *AsmStr = I->getOperand(0).getSymbolName();
-
- // inlineasm length estimate is number of bytes assuming the longest
- // instruction.
- uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
- NumInstr += MaxAsmSize / MAI->getMaxInstLength();
- } else {
- ++NumInstr;
- }
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-
- if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
- return;
-
- DebugLoc DL = From.getDebugLoc();
- BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addOperand(To);
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(&MBB, &MBB.getParent()->back()))
- return false;
-
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
- MBB.addSuccessor(SkipBB);
-
const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
-
- MachineBasicBlock::iterator Insert = SkipBB->begin();
-
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef);
-
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
-
- return true;
-}
-
-void SILowerControlFlow::If(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
- .addReg(Vcc);
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
-
- Skip(MI, MI.getOperand(2));
+ MachineOperand &SaveExec = MI.getOperand(0);
+ MachineOperand &Cond = MI.getOperand(1);
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+ Cond.getSubReg() == AMDGPU::NoSubRegister);
- // Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Reg);
+ unsigned SaveExecReg = SaveExec.getReg();
- MI.eraseFromParent();
-}
+ MachineOperand &ImpDefSCC = MI.getOperand(4);
+ assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
-void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
-
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
- .addReg(Src); // Saved EXEC
-
- if (ExecModified) {
- // Adjust the saved exec to account for the modifications during the flow
- // block that contains the ELSE. This can happen when WQM mode is switched
- // off.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
+ // Add an implicit def of exec to discourage scheduling VALU after this, which
+ // will interfere with trying to form s_and_saveexec_b64 later.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *CopyExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+ unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ MachineInstr *And =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+ .addReg(CopyReg)
+ //.addReg(AMDGPU::EXEC)
+ .addReg(Cond.getReg());
+ setImpSCCDefDead(*And, true);
+
+ MachineInstr *Xor =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
+ setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+ // Use a copy that is a terminator to get correct spill code placement with
+ // fast regalloc.
+ MachineInstr *SetExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+ .addReg(Tmp, RegState::Kill);
+
+ // Insert a pseudo terminator to help keep the verifier happy. This will also
+ // be used later when inserting skips.
+ MachineInstr *NewBr =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2));
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
}
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
+ LIS->InsertMachineInstrInMaps(*CopyExec);
- Skip(MI, MI.getOperand(2));
+ // Replace with and so we don't need to fix the live interval for condition
+ // register.
+ LIS->ReplaceMachineInstrInMaps(MI, *And);
- // Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Dst);
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*SetExec);
+ LIS->InsertMachineInstrInMaps(*NewBr);
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Break(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
-
- MI.eraseFromParent();
+ // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+ // hard to add another def here but I'm not sure how to correctly update the
+ // valno.
+ LIS->removeInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(Tmp);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
}
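
A worked example of the mask arithmetic emitIf builds, using small illustrative bit patterns: the AND result becomes the new EXEC for the "then" lanes, and the XOR recovers the active lanes where the condition was false, which SI_ELSE and SI_END_CF consume later.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0b1111;        // CopyReg: EXEC on entry to the if
  uint64_t Cond = 0b0101;        // lanes where the branch condition is true
  uint64_t Tmp = Exec & Cond;    // written back to EXEC via S_MOV_B64_term
  uint64_t SaveExec = Tmp ^ Exec; // active lanes where the condition is false
  assert(Tmp == 0b0101 && SaveExec == 0b1010);
}
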
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Vcc)
- .addReg(Src);
-
- MI.eraseFromParent();
-}
+ const DebugLoc &DL = MI.getDebugLoc();
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Saved = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
+ bool ExecModified = MI.getOperand(3).getImm() != 0;
+ MachineBasicBlock::iterator Start = MBB.begin();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Saved)
- .addReg(Src);
+ // We are running before TwoAddressInstructions, and si_else's operands are
+ // tied. In order to correctly tie the registers, split this into a copy of
+ // the src like it does.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addOperand(MI.getOperand(1)); // Saved EXEC
- MI.eraseFromParent();
-}
+ // This must be inserted before phis and any spill code inserted before the
+ // else.
+ unsigned SaveReg = ExecModified ?
+ MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+ MachineInstr *OrSaveExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+ .addReg(CopyReg);
-void SILowerControlFlow::Loop(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Src = MI.getOperand(0).getReg();
+ MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
+ MachineBasicBlock::iterator ElsePt(MI);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1));
+ if (ExecModified) {
+ MachineInstr *And =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveReg);
- MI.eraseFromParent();
-}
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*And);
+ }
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
+ MachineInstr *Xor =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(DstReg);
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
+ MachineInstr *Branch =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addMBB(DestBB);
- MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Branch(MachineInstr &MI) {
- MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
- if (MBB == MI.getParent()->getNextNode())
+ if (!LIS) {
MI.eraseFromParent();
-
- // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
- CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
- // Kill is only allowed in pixel / geometry shaders.
- assert(CallConv == CallingConv::AMDGPU_PS ||
- CallConv == CallingConv::AMDGPU_GS);
-#endif
-
- // Clear this thread from the exec mask if the operand is negative
- if ((Op.isImm())) {
- // Constant operand: Set exec mask to 0 or do nothing
- if (Op.getImm() & 0x80000000) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addImm(0);
- }
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
- .addOperand(Op);
+ return;
}
+ LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
-}
-// All currently live registers must remain so in the remainder block.
-void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
- const MachineRegisterInfo &MRI,
- const MachineInstr &MI,
- MachineBasicBlock &LoopBB,
- MachineBasicBlock &RemainderBB,
- unsigned SaveReg,
- const MachineOperand &IdxReg) {
- // Add reg defined in loop body.
- RemainderLiveRegs.addReg(SaveReg);
-
- if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
- if (!Val->isUndef()) {
- RemainderLiveRegs.addReg(Val->getReg());
- LoopBB.addLiveIn(Val->getReg());
- }
- }
+ LIS->InsertMachineInstrInMaps(*OrSaveExec);
- for (unsigned Reg : RemainderLiveRegs) {
- if (MRI.isAllocatable(Reg))
- RemainderBB.addLiveIn(Reg);
- }
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*Branch);
- const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- if (!Src->isUndef())
- LoopBB.addLiveIn(Src->getReg());
+ // src reg is tied to dst reg.
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
+ if (ExecModified)
+ LIS->createAndComputeVirtRegInterval(SaveReg);
- if (!IdxReg.isUndef())
- LoopBB.addLiveIn(IdxReg.getReg());
- LoopBB.sortUniqueLiveIns();
+ // Let this be recomputed.
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
-void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
- DebugLoc DL,
- MachineInstr *MovRel,
- const MachineOperand &IdxReg,
- int Offset) {
- MachineBasicBlock::iterator I = LoopBB.begin();
-
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
- // Move index from VCC into M0
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
-
- // Compare the just read M0 value to all possible Idx values
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
-
- if (Offset != 0) {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
-
- // Do the actual move
- LoopBB.insert(I, MovRel);
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Dst = MI.getOperand(0).getReg();
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ MachineInstr *Or =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ .addOperand(MI.getOperand(1));
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&LoopBB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ MI.eraseFromParent();
}
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
-
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, SkipBB);
-
- return SkipBB;
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
-std::pair<MachineBasicBlock *, MachineBasicBlock *>
-SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
- MachineFunction *MF = MBB.getParent();
-
- // To insert the loop we need to split the block. Move everything after this
- // point to a new block, and insert a new empty block between the two.
- MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
- MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, LoopBB);
- MF->insert(MBBI, RemainderBB);
-
- // Move the rest of the block into a new block.
- RemainderBB->transferSuccessors(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
- MBB.addSuccessor(LoopBB);
-
- return std::make_pair(LoopBB, RemainderBB);
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
-// Returns true if a new block was inserted.
-bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
- const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ MachineInstr *AndN2 =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
- if (Offset != 0) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
- .addImm(Offset);
- } else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
- }
+ MachineInstr *Branch =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addOperand(MI.getOperand(1));
- MBB.insert(I, MovRel);
- MI.eraseFromParent();
- return false;
+ if (LIS) {
+ LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+ LIS->InsertMachineInstrInMaps(*Branch);
}
- MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- SaveOp->setIsDead(false);
- unsigned Save = SaveOp->getReg();
-
- // Reading from a VGPR requires looping over all workitems in the wavefront.
- assert(AMDGPU::SReg_64RegClass.contains(Save) &&
- AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
-
- // Save the EXEC mask
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
-
- LivePhysRegs RemainderLiveRegs(TRI);
-
- RemainderLiveRegs.addLiveOuts(MBB);
-
- MachineBasicBlock *LoopBB;
- MachineBasicBlock *RemainderBB;
-
- std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
-
- for (const MachineInstr &Inst : reverse(*RemainderBB))
- RemainderLiveRegs.stepBackward(Inst);
-
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- LoopBB->addSuccessor(RemainderBB);
- LoopBB->addSuccessor(LoopBB);
-
- splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
- *RemainderBB, Save, *Idx);
-
- emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
-
- MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
-
MI.eraseFromParent();
- return true;
-}
-
-/// \param @VecReg The register which holds element zero of the vector being
-/// addressed into.
-//
-/// \param[in] @Idx The index operand from the movrel instruction. This must be
-// a register, but may be NoRegister.
-///
-/// \param[in] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
-// value that needs to be added to the value stored in M0.
-std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
- unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
- if (!SubReg)
- SubReg = VecReg;
-
- const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
- const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int NumElts = SuperRC->getSize() / RC->getSize();
-
- int BaseRegIdx = TRI->getHWRegIndex(SubReg);
-
- // Skip out of bounds offsets, or else we would end up using an undefined
- // register.
- if (Offset >= NumElts)
- return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
-
- int RegIdx = BaseRegIdx + Offset;
- if (RegIdx < 0) {
- Offset = RegIdx;
- RegIdx = 0;
- } else {
- Offset = 0;
- }
-
- unsigned Reg = RC->getRegister(RegIdx);
- return std::make_pair(Reg, Offset);
}
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+ MachineBasicBlock::iterator InsPt = MBB.begin();
+ MachineInstr *NewMI =
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
- MI.eraseFromParent();
- return false;
- }
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ MI.eraseFromParent();
- return loadM0(MI, MovRel, Offset);
+ if (LIS)
+ LIS->handleMove(*NewMI);
}
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
-
- MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
- .addOperand(*Val);
- MI.eraseFromParent();
- return false;
+// Returns replacement operands for a logical operation: either a single
+// result for exec, or two operands if the source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const {
+ MachineOperand &Op = MI.getOperand(OpNo);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ Src.push_back(Op);
+ return;
}
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
- .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
- .addReg(Dst, RegState::Implicit);
+ MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+ if (!Def || Def->getParent() != MI.getParent() ||
+ !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+ return;
- return loadM0(MI, MovRel, Offset);
+ // Make sure we do not modify exec between def and use.
+ // A copy with implicitly defined exec inserted earlier is an exception; it
+ // does not really modify exec.
+ for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+ !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+ return;
+
+ for (const auto &SrcOp : Def->explicit_operands())
+ if (SrcOp.isUse() && (!SrcOp.isReg() ||
+ TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ SrcOp.getReg() == AMDGPU::EXEC))
+ Src.push_back(SrcOp);
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
+// One of the operands is the exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+ assert(MI.getNumExplicitOperands() == 3);
+ SmallVector<MachineOperand, 4> Ops;
+ unsigned OpToReplace = 1;
+ findMaskOperands(MI, 1, Ops);
+ if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+ findMaskOperands(MI, 2, Ops);
+ if (Ops.size() != 3) return;
+
+ unsigned UniqueOpndIdx;
+ if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+ else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else return;
+
+ unsigned Reg = MI.getOperand(OpToReplace).getReg();
+ MI.RemoveOperand(OpToReplace);
+ MI.addOperand(Ops[UniqueOpndIdx]);
+ if (MRI->use_empty(Reg))
+ MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
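
combineMasks relies on the identity that an AND or OR with a repeated operand collapses, so re-masking a value already masked by EXEC is redundant. A compile-time check of that identity (illustration only):

//   x & (x & y) == x & y        x | (x | y) == x | y
static_assert((0b1100u & (0b1100u & 0b1010u)) == (0b1100u & 0b1010u),
              "nested AND collapses");
static_assert((0b1100u | (0b1100u | 0b1010u)) == (0b1100u | 0b1010u),
              "nested OR collapses");
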
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -688,148 +402,66 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- bool HaveKill = false;
- bool NeedFlat = false;
- unsigned Depth = 0;
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ MRI = &MF.getRegInfo();
MachineFunction::iterator NextBB;
-
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
- MachineBasicBlock::iterator I, Next;
- bool ExecModified = false;
+ MachineBasicBlock::iterator I, Next, Last;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
- // Flat uses m0 in case it needs to access LDS.
- if (TII->isFLAT(MI))
- NeedFlat = true;
-
- if (I->modifiesRegister(AMDGPU::EXEC, TRI))
- ExecModified = true;
-
switch (MI.getOpcode()) {
- default: break;
- case AMDGPU::SI_IF:
- ++Depth;
- If(MI);
- break;
-
- case AMDGPU::SI_ELSE:
- Else(MI, ExecModified);
- break;
-
- case AMDGPU::SI_BREAK:
- Break(MI);
- break;
-
- case AMDGPU::SI_IF_BREAK:
- IfBreak(MI);
- break;
-
- case AMDGPU::SI_ELSE_BREAK:
- ElseBreak(MI);
- break;
-
- case AMDGPU::SI_LOOP:
- ++Depth;
- Loop(MI);
- break;
-
- case AMDGPU::SI_END_CF:
- if (--Depth == 0 && HaveKill) {
- HaveKill = false;
- // TODO: Insert skip if exec is 0?
- }
-
- EndCf(MI);
- break;
-
- case AMDGPU::SI_KILL_TERMINATOR:
- if (Depth == 0) {
- if (skipIfDead(MI, *NextBB)) {
- NextBB = std::next(BI);
- BE = MF.end();
- }
- } else
- HaveKill = true;
- Kill(MI);
- break;
-
- case AMDGPU::S_BRANCH:
- Branch(MI);
- break;
-
- case AMDGPU::SI_INDIRECT_SRC_V1:
- case AMDGPU::SI_INDIRECT_SRC_V2:
- case AMDGPU::SI_INDIRECT_SRC_V4:
- case AMDGPU::SI_INDIRECT_SRC_V8:
- case AMDGPU::SI_INDIRECT_SRC_V16:
- if (indirectSrc(MI)) {
- // The block was split at this point. We can safely skip the middle
- // inserted block to the following which contains the rest of this
- // block's instructions.
- NextBB = std::next(BI);
- BE = MF.end();
- Next = MBB.end();
- }
-
- break;
-
- case AMDGPU::SI_INDIRECT_DST_V1:
- case AMDGPU::SI_INDIRECT_DST_V2:
- case AMDGPU::SI_INDIRECT_DST_V4:
- case AMDGPU::SI_INDIRECT_DST_V8:
- case AMDGPU::SI_INDIRECT_DST_V16:
- if (indirectDst(MI)) {
- // The block was split at this point. We can safely skip the middle
- // inserted block to the following which contains the rest of this
- // block's instructions.
- NextBB = std::next(BI);
- BE = MF.end();
- Next = MBB.end();
- }
-
- break;
-
- case AMDGPU::SI_RETURN: {
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN is not the last instruction. Add an empty block at
- // the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
- }
- break;
- }
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_BREAK:
+ emitBreak(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_ELSE_BREAK:
+ emitElseBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MI);
+ Last = I;
+ continue;
+
+ default:
+ Last = I;
+ continue;
}
- }
- }
- if (NeedFlat && MFI->IsKernel) {
- // TODO: What to use with function calls?
- // We will need to Initialize the flat scratch register pair.
- if (NeedFlat)
- MFI->setHasFlatInstructions(true);
+ // Replay newly inserted code to combine masks
+ Next = (Last == MBB.end()) ? MBB.begin() : Last;
+ }
}
return true;
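
The rewritten loop above expands each control-flow pseudo through a dedicated emit* helper and then rewinds the iterator to Last, the most recent instruction it left untouched, so that any exec-mask bit operations produced by the expansion are themselves visited by combineMasks on the next sweep over the block. A minimal, self-contained sketch of that replay idiom, using a plain std::list in place of a MachineBasicBlock (a negative value merely stands in for "pseudo that needs expansion"; nothing here is taken from the pass itself):

    #include <iterator>
    #include <list>

    // Sketch only: revisit elements that an expansion step just inserted so
    // the cleanup step also sees them, mirroring the Last/Next replay above.
    void processWithReplay(std::list<int> &L) {
      auto Last = L.end();
      for (auto I = L.begin(), Next = I; I != L.end(); I = Next) {
        Next = std::next(I);
        if (*I < 0) {                    // stands in for a control-flow pseudo
          L.insert(I, -*I);              // "expansion" inserts replacement code
          L.erase(I);                    // the pseudo itself goes away
          Next = (Last == L.end()) ? L.begin() : Last;  // replay inserted code
          continue;
        }
        Last = I;                        // cleanup/default: remember position
      }
    }
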
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index dc1d20d..be2e14f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -41,9 +41,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Lower i1 Copies";
- }
+ StringRef getPassName() const override { return "SI Lower i1 Copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -102,12 +100,12 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
I1Defs.push_back(Dst.getReg());
- DebugLoc DL = MI.getDebugLoc();
- MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
if (DefInst->getOperand(1).isImm()) {
I1Defs.push_back(Dst.getReg());
@@ -131,10 +129,26 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
- .addOperand(Dst)
- .addOperand(Src)
- .addImm(0);
+ if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+ DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+ DefInst->getOperand(1).getImm() == 0 &&
+ DefInst->getOperand(2).getImm() != 0 &&
+ DefInst->getOperand(3).isReg() &&
+ TargetRegisterInfo::isVirtualRegister(
+ DefInst->getOperand(3).getReg()) &&
+ TRI->getCommonSubClass(
+ MRI.getRegClass(DefInst->getOperand(3).getReg()),
+ &AMDGPU::SGPR_64RegClass)) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+ .addOperand(Dst)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(DefInst->getOperand(3));
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+ .addOperand(Dst)
+ .addOperand(Src)
+ .addImm(0);
+ }
MI.eraseFromParent();
}
}
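
The new branch above covers the common way an i1 ends up in a VGPR: a V_CNDMASK_B32_e64 selecting between 0 and a non-zero immediate under an SGPR-pair condition. Copying such a value back to a 64-bit lane mask does not need a V_CMP_NE_U32 against the VGPR; ANDing the original condition with EXEC yields the same mask. As a reading aid, the operand checks added above amount to the following free-standing predicate (a sketch, not a helper that exists in the pass):

    // Sketch: does Def materialize an i1 as "cond ? non-zero : 0", with the
    // condition register in operand 3?  Mirrors the checks in the hunk above.
    static bool isLaneMaskSelect(const llvm::MachineInstr &Def) {
      using namespace llvm;
      return Def.getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
             Def.getOperand(1).isImm() && Def.getOperand(1).getImm() == 0 &&
             Def.getOperand(2).isImm() && Def.getOperand(2).getImm() != 0 &&
             Def.getOperand(3).isReg() &&
             TargetRegisterInfo::isVirtualRegister(Def.getOperand(3).getReg());
    }
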
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 848be32..ecd46b9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -26,9 +26,6 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-// Pin the vtable to this file.
-void SIMachineFunctionInfo::anchor() {}
-
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
@@ -51,8 +48,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
ReturnsVoid(true),
- MaximumWorkGroupSize(0),
- DebuggerReservedVGPRCount(0),
+ FlatWorkGroupSizes(0, 0),
+ WavesPerEU(0, 0),
DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),
@@ -62,14 +59,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
HasNonSpillStackObjects(false),
- HasFlatInstructions(false),
NumSpilledSGPRs(0),
NumSpilledVGPRs(0),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
- DispatchID(false),
KernargSegmentPtr(false),
+ DispatchID(false),
FlatScratchInit(false),
GridWorkgroupCountX(false),
GridWorkgroupCountY(false),
@@ -81,13 +77,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(false),
WorkItemIDY(false),
- WorkItemIDZ(false) {
+ WorkItemIDZ(false),
+ PrivateMemoryInputPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
if (!AMDGPU::isShader(F->getCallingConv())) {
KernargSegmentPtr = true;
@@ -113,12 +110,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDY = true;
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
- bool HasStackObjects = FrameInfo->hasStackObjects();
+ bool HasStackObjects = FrameInfo.hasStackObjects();
if (HasStackObjects || MaySpill)
PrivateSegmentWaveByteOffset = true;
- if (ST.isAmdHsaOS()) {
+ if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -127,6 +124,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F->hasFnAttribute("amdgpu-queue-ptr"))
QueuePtr = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-id"))
+ DispatchID = true;
+ } else if (ST.isMesaGfxShader(MF)) {
+ if (HasStackObjects || MaySpill)
+ PrivateMemoryInputPtr = true;
}
// We don't need to worry about accessing spills with flat instructions.
@@ -136,13 +139,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ST.isAmdHsaOS())
FlatScratchInit = true;
- if (AMDGPU::isCompute(F->getCallingConv()))
- MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
- else
- MaximumWorkGroupSize = ST.getWavefrontSize();
-
- if (ST.debuggerReserveRegs())
- DebuggerReservedVGPRCount = 4;
+ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+ WavesPerEU = ST.getWavesPerEU(*F);
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -174,6 +172,13 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return KernargSegmentPtrUserSGPR;
}
+unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
+ DispatchIDUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return DispatchIDUserSGPR;
+}
+
unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
@@ -181,6 +186,13 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return FlatScratchInitUserSGPR;
}
+unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
+ PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return PrivateMemoryPtrUserSGPR;
+}
+
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
MachineFunction *MF,
unsigned FrameIndex,
@@ -191,9 +203,9 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
- int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
+ int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
Offset += SubIdx * 4;
unsigned LaneVGPRIdx = Offset / (64 * 4);
@@ -223,8 +235,3 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
return Spill;
}
-
-unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
- const MachineFunction &MF) const {
- return MaximumWorkGroupSize;
-}
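
With the single MaximumWorkGroupSize value gone, clients of SIMachineFunctionInfo read the two new pair-valued properties instead; in both pairs the first element is the minimum and the second the maximum. A short sketch of how downstream code might consume them (MF is assumed to be the MachineFunction being compiled; the getters are the ones added to SIMachineFunctionInfo.h below):

    // Sketch: querying the new flat work group size / waves-per-EU bounds.
    const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
    std::pair<unsigned, unsigned> WG = MFI.getFlatWorkGroupSizes();
    unsigned MinWaves = MFI.getMinWavesPerEU();
    unsigned MaxWaves = MFI.getMaxWavesPerEU();
    assert(WG.first <= WG.second && MinWaves <= MaxWaves);
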
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f5bd636..6fc8d18 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -23,12 +23,59 @@ namespace llvm {
class MachineRegisterInfo;
+class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+public:
+ explicit AMDGPUImagePseudoSourceValue() :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+ bool isConstant(const MachineFrameInfo *) const override {
+ // This should probably be true for most images, but we will start by being
+ // conservative.
+ return false;
+ }
+
+ bool isAliased(const MachineFrameInfo *) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+
+ bool mayAlias(const MachineFrameInfo*) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+};
+
+class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+public:
+ explicit AMDGPUBufferPseudoSourceValue() :
+ PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+ bool isConstant(const MachineFrameInfo *) const override {
+    // This should probably be true for most buffers, but we will start by being
+ // conservative.
+ return false;
+ }
+
+ bool isAliased(const MachineFrameInfo *) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+
+ bool mayAlias(const MachineFrameInfo*) const override {
+ // FIXME: If we ever change image intrinsics to accept fat pointers, then
+ // this could be true for some cases.
+ return false;
+ }
+};
+
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// FIXME: This should be removed and getPreloadedValue moved here.
- friend struct SIRegisterInfo;
- void anchor() override;
+ friend class SIRegisterInfo;
unsigned TIDReg;
@@ -37,6 +84,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;
+ // Input registers for non-HSA ABI
+ unsigned PrivateMemoryPtrUserSGPR;
+
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
unsigned PrivateSegmentBufferUserSGPR;
@@ -61,15 +111,22 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned PSInputAddr;
bool ReturnsVoid;
- unsigned MaximumWorkGroupSize;
+ // A pair of default/requested minimum/maximum flat work group sizes.
+ // Minimum - first, maximum - second.
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes;
+
+ // A pair of default/requested minimum/maximum number of waves per execution
+ // unit. Minimum - first, maximum - second.
+ std::pair<unsigned, unsigned> WavesPerEU;
- // Number of reserved VGPRs for debugger usage.
- unsigned DebuggerReservedVGPRCount;
// Stack object indices for work group IDs.
std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
// Stack object indices for work item IDs.
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+ AMDGPUBufferPseudoSourceValue BufferPSV;
+ AMDGPUImagePseudoSourceValue ImagePSV;
+
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
@@ -83,7 +140,6 @@ private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
bool HasNonSpillStackObjects;
- bool HasFlatInstructions;
unsigned NumSpilledSGPRs;
unsigned NumSpilledVGPRs;
@@ -92,8 +148,8 @@ private:
bool PrivateSegmentBuffer : 1;
bool DispatchPtr : 1;
bool QueuePtr : 1;
- bool DispatchID : 1;
bool KernargSegmentPtr : 1;
+ bool DispatchID : 1;
bool FlatScratchInit : 1;
bool GridWorkgroupCountX : 1;
bool GridWorkgroupCountY : 1;
@@ -110,6 +166,11 @@ private:
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
+ // Private memory buffer
+ // Compute directly in sgpr[0:1]
+ // Other shaders indirect 64-bits at sgpr[0:1]
+ bool PrivateMemoryInputPtr : 1;
+
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -143,7 +204,9 @@ public:
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+ unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -192,14 +255,14 @@ public:
return QueuePtr;
}
- bool hasDispatchID() const {
- return DispatchID;
- }
-
bool hasKernargSegmentPtr() const {
return KernargSegmentPtr;
}
+ bool hasDispatchID() const {
+ return DispatchID;
+ }
+
bool hasFlatScratchInit() const {
return FlatScratchInit;
}
@@ -248,6 +311,10 @@ public:
return WorkItemIDZ;
}
+ bool hasPrivateMemoryInputPtr() const {
+ return PrivateMemoryInputPtr;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -284,6 +351,10 @@ public:
return QueuePtrUserSGPR;
}
+ unsigned getPrivateMemoryPtrUserSGPR() const {
+ return PrivateMemoryPtrUserSGPR;
+ }
+
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
@@ -308,14 +379,6 @@ public:
HasNonSpillStackObjects = StackObject;
}
- bool hasFlatInstructions() const {
- return HasFlatInstructions;
- }
-
- void setHasFlatInstructions(bool UseFlat = true) {
- HasFlatInstructions = UseFlat;
- }
-
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}
@@ -352,9 +415,36 @@ public:
ReturnsVoid = Value;
}
- /// \returns Number of reserved VGPRs for debugger usage.
- unsigned getDebuggerReservedVGPRCount() const {
- return DebuggerReservedVGPRCount;
+ /// \returns A pair of default/requested minimum/maximum flat work group sizes
+ /// for this function.
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const {
+ return FlatWorkGroupSizes;
+ }
+
+ /// \returns Default/requested minimum flat work group size for this function.
+ unsigned getMinFlatWorkGroupSize() const {
+ return FlatWorkGroupSizes.first;
+ }
+
+ /// \returns Default/requested maximum flat work group size for this function.
+ unsigned getMaxFlatWorkGroupSize() const {
+ return FlatWorkGroupSizes.second;
+ }
+
+ /// \returns A pair of default/requested minimum/maximum number of waves per
+ /// execution unit.
+ std::pair<unsigned, unsigned> getWavesPerEU() const {
+ return WavesPerEU;
+ }
+
+ /// \returns Default/requested minimum number of waves per execution unit.
+ unsigned getMinWavesPerEU() const {
+ return WavesPerEU.first;
+ }
+
+ /// \returns Default/requested maximum number of waves per execution unit.
+ unsigned getMaxWavesPerEU() const {
+ return WavesPerEU.second;
}
/// \returns Stack object index for \p Dim's work group ID.
@@ -413,7 +503,13 @@ public:
llvm_unreachable("unexpected dimension");
}
- unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
+ const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
+ return &BufferPSV;
+ }
+
+ const AMDGPUImagePseudoSourceValue *getImagePSV() const {
+ return &ImagePSV;
+ }
};
} // End namespace llvm
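
The two PseudoSourceValue classes give buffer and image intrinsic accesses a memory "object" that machine-level alias analysis can reason about, so those accesses are not treated as potentially aliasing stack objects. A sketch of how a memory operand might be tagged while lowering a buffer intrinsic (the size and alignment are placeholder values, not taken from any particular lowering):

    // Sketch: attach the function-wide buffer pseudo source value to an MMO.
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MachinePointerInfo(MFI->getBufferPSV()),
        MachineMemOperand::MOLoad, /*Size=*/16, /*BaseAlignment=*/16);
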
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 7125b41..da86bbf 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1,4 +1,4 @@
-//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,12 +13,28 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "SIInstrInfo.h"
#include "SIMachineScheduler.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -77,11 +93,11 @@ using namespace llvm;
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
-// Second the order of the instructions inside the blocks is choosen.
+// Second the order of the instructions inside the blocks is chosen.
// At that step we do take into account only register usage and hiding
// low latency instructions
//
-// Third the block order is choosen, there we try to hide high latencies
+// Third the block order is chosen, there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
@@ -89,7 +105,7 @@ using namespace llvm;
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data go to the register,
-// and the bandwith limitations.
+// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
@@ -120,7 +136,6 @@ using namespace llvm;
// 300-600 cycles. We do not specially take that into account when scheduling
// As we expect the driver to be able to preload the constants soon.
-
// common code //
#ifndef NDEBUG
@@ -181,7 +196,6 @@ void SIScheduleBlock::addUnit(SUnit *SU) {
}
#ifndef NDEBUG
-
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
@@ -209,7 +223,7 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
// we haven't waited for
// . Low latencies
// . All other instructions
- // Goal is to get: low latency instructions - independant instructions
+ // Goal is to get: low latency instructions - independent instructions
// - (eventually some more low latency instructions)
// - instructions that depend on the first low latency instructions.
// If in the block there is a lot of constant loads, the SGPR usage
@@ -479,8 +493,7 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
void SIScheduleBlock::nodeScheduled(SUnit *SU) {
// Is in TopReadySUs
assert (!SU->NumPredsLeft);
- std::vector<SUnit*>::iterator I =
- std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
+ std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
if (I == TopReadySUs.end()) {
dbgs() << "Data Structure Bug in SI Scheduler\n";
llvm_unreachable(nullptr);
@@ -589,9 +602,8 @@ void SIScheduleBlock::printDebug(bool full) {
}
}
- dbgs() << "///////////////////////\n";
+ dbgs() << "///////////////////////\n";
}
-
#endif
// SIScheduleBlockCreator //
@@ -600,8 +612,7 @@ SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}
-SIScheduleBlockCreator::~SIScheduleBlockCreator() {
-}
+SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
@@ -1059,8 +1070,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
unsigned Color = CurrentColoring[SU->NodeNum];
if (RealID.find(Color) == RealID.end()) {
int ID = CurrentBlocks.size();
- BlockPtrs.push_back(
- make_unique<SIScheduleBlock>(DAG, this, ID));
+ BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
RealID[Color] = ID;
}
@@ -1104,30 +1114,17 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
// Two functions taken from Codegen/MachineScheduler.cpp
-/// If this iterator is a debug value, increment until reaching the End or a
-/// non-debug instruction.
-static MachineBasicBlock::const_iterator
-nextIfDebug(MachineBasicBlock::const_iterator I,
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
MachineBasicBlock::const_iterator End) {
- for(; I != End; ++I) {
+ for (; I != End; ++I) {
if (!I->isDebugValue())
break;
}
return I;
}
-/// Non-const version.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I,
- MachineBasicBlock::const_iterator End) {
- // Cast the return value to nonconst MachineInstr, then cast to an
- // instr_iterator, which does not check for null, finally return a
- // bundle_iterator.
- return MachineBasicBlock::instr_iterator(
- const_cast<MachineInstr*>(
- &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
-}
-
void SIScheduleBlockCreator::topologicalSort() {
unsigned DAGSize = CurrentBlocks.size();
std::vector<int> WorkList;
@@ -1217,7 +1214,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
// Update LiveIntervals.
- // Note: Moving all instructions and calling handleMove everytime
+ // Note: Moving all instructions and calling handleMove every time
// is the most cpu intensive operation of the scheduler.
// It would gain a lot if there was a way to recompute the
// LiveIntervals for the entire scheduling region.
@@ -1265,7 +1262,7 @@ void SIScheduleBlockCreator::fillStats() {
for (unsigned i = 0, e = DAGSize; i != e; ++i) {
int BlockIndice = TopDownIndex2Block[i];
SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
- if (Block->getPreds().size() == 0)
+ if (Block->getPreds().empty())
Block->Depth = 0;
else {
unsigned Depth = 0;
@@ -1280,7 +1277,7 @@ void SIScheduleBlockCreator::fillStats() {
for (unsigned i = 0, e = DAGSize; i != e; ++i) {
int BlockIndice = BottomUpIndex2Block[i];
SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
- if (Block->getSuccs().size() == 0)
+ if (Block->getSuccs().empty())
Block->Height = 0;
else {
unsigned Height = 0;
@@ -1654,20 +1651,15 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
// SIScheduleDAGMI //
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
- ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
+ ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
SITII = static_cast<const SIInstrInfo*>(TII);
SITRI = static_cast<const SIRegisterInfo*>(TRI);
- VGPRSetID = SITRI->getVGPR32PressureSet();
- SGPRSetID = SITRI->getSGPR32PressureSet();
-}
-
-SIScheduleDAGMI::~SIScheduleDAGMI() {
+ VGPRSetID = SITRI->getVGPRPressureSet();
+ SGPRSetID = SITRI->getSGPRPressureSet();
}
-ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
- return new SIScheduleDAGMI(C);
-}
+SIScheduleDAGMI::~SIScheduleDAGMI() = default;
// Code adapted from scheduleDAG.cpp
// Does a topological sort over the SUs.
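
Because the scheduler now asks SIRegisterInfo for the generic VGPR/SGPR pressure sets (the SIRegisterInfo.cpp hunk further below picks the set covering the most register units), register pressure for a region is read back with those same IDs. A small sketch (SITRI is the member set up in the constructor above; RP is assumed to be a RegisterPressure snapshot for the region):

    // Sketch: reading VGPR/SGPR pressure through the renamed set queries.
    unsigned VGPRSetID = SITRI->getVGPRPressureSet();
    unsigned SGPRSetID = SITRI->getSGPRPressureSet();
    unsigned VgprPressure = RP.MaxSetPressure[VGPRSetID];
    unsigned SgprPressure = RP.MaxSetPressure[SGPRSetID];
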
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index 117aed4..77c0735 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -1,4 +1,4 @@
-//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===//
+//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,10 +16,16 @@
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
-
-using namespace llvm;
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
namespace llvm {
@@ -93,12 +99,10 @@ class SIScheduleBlock {
public:
SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
unsigned ID):
- DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
- TopRPTracker(TopPressure), Scheduled(false),
- HighLatencyBlock(false), ID(ID),
- Preds(), Succs(), NumHighLatencySuccessors(0) {};
+ DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
+ HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
- ~SIScheduleBlock() {};
+ ~SIScheduleBlock() = default;
unsigned getID() const { return ID; }
@@ -146,7 +150,6 @@ public:
bool isScheduled() { return Scheduled; }
-
// Needs the block to be scheduled inside
// TODO: find a way to compute it.
std::vector<unsigned> &getInternalAdditionnalRegUsage() {
@@ -161,7 +164,7 @@ public:
private:
struct SISchedCandidate : SISchedulerCandidate {
// The best SUnit candidate.
- SUnit *SU;
+ SUnit *SU = nullptr;
unsigned SGPRUsage;
unsigned VGPRUsage;
@@ -169,8 +172,7 @@ private:
unsigned LowLatencyOffset;
bool HasLowLatencyNonWaitedParent;
- SISchedCandidate()
- : SU(nullptr) {}
+ SISchedCandidate() = default;
bool isValid() const { return SU; }
@@ -341,17 +343,17 @@ public:
SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
SISchedulerBlockSchedulerVariant Variant,
SIScheduleBlocks BlocksStruct);
- ~SIScheduleBlockScheduler() {};
+ ~SIScheduleBlockScheduler() = default;
- std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; };
+ std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
- unsigned getVGPRUsage() { return maxVregUsage; };
- unsigned getSGPRUsage() { return maxSregUsage; };
+ unsigned getVGPRUsage() { return maxVregUsage; }
+ unsigned getSGPRUsage() { return maxSregUsage; }
private:
struct SIBlockSchedCandidate : SISchedulerCandidate {
// The best Block candidate.
- SIScheduleBlock *Block;
+ SIScheduleBlock *Block = nullptr;
bool IsHighLatency;
int VGPRUsageDiff;
@@ -360,8 +362,7 @@ private:
unsigned LastPosHighLatParentScheduled;
unsigned Height;
- SIBlockSchedCandidate()
- : Block(nullptr) {}
+ SIBlockSchedCandidate() = default;
bool isValid() const { return Block; }
@@ -409,9 +410,9 @@ class SIScheduler {
SIScheduleBlockCreator BlockCreator;
public:
- SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {};
+ SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
- ~SIScheduler() {};
+ ~SIScheduler() = default;
struct SIScheduleBlockResult
scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
@@ -445,13 +446,13 @@ public:
}
MachineBasicBlock *getBB() { return BB; }
- MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; };
- MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; };
+ MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+ MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
LiveIntervals *getLIS() { return LIS; }
MachineRegisterInfo *getMRI() { return &MRI; }
const TargetRegisterInfo *getTRI() { return TRI; }
- SUnit& getEntrySU() { return EntrySU; };
- SUnit& getExitSU() { return ExitSU; };
+ SUnit& getEntrySU() { return EntrySU; }
+ SUnit& getExitSU() { return ExitSU; }
void restoreSULinksLeft();
@@ -459,13 +460,14 @@ public:
_Iterator End,
unsigned &VgprUsage,
unsigned &SgprUsage);
+
std::set<unsigned> getInRegs() {
std::set<unsigned> InRegs;
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
InRegs.insert(RegMaskPair.RegUnit);
}
return InRegs;
- };
+ }
unsigned getVGPRSetID() const { return VGPRSetID; }
unsigned getSGPRSetID() const { return SGPRSetID; }
@@ -486,6 +488,6 @@ public:
std::vector<int> BottomUpIndex2SU;
};
-} // namespace llvm
+} // end namespace llvm
-#endif /* SIMACHINESCHEDULER_H_ */
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
new file mode 100644
index 0000000..4d2f917
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -0,0 +1,304 @@
+//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking"
+
+namespace {
+
+class SIOptimizeExecMasking : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIOptimizeExecMasking() : MachineFunctionPass(ID) {
+ initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI optimize exec mask operations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
+ "SI optimize exec mask operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
+ "SI optimize exec mask operations", false, false)
+
+char SIOptimizeExecMasking::ID = 0;
+
+char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
+
+/// If \p MI is a copy from exec, return the register copied to.
+static unsigned isCopyFromExec(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B64_term: {
+ const MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
+ return MI.getOperand(0).getReg();
+ }
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+/// If \p MI is a copy to exec, return the register copied from.
+static unsigned isCopyToExec(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::S_MOV_B64: {
+ const MachineOperand &Dst = MI.getOperand(0);
+ if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+ return MI.getOperand(1).getReg();
+ break;
+ }
+ case AMDGPU::S_MOV_B64_term:
+ llvm_unreachable("should have been replaced");
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+static unsigned getSaveExecOp(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::S_AND_B64:
+ return AMDGPU::S_AND_SAVEEXEC_B64;
+ case AMDGPU::S_OR_B64:
+ return AMDGPU::S_OR_SAVEEXEC_B64;
+ case AMDGPU::S_XOR_B64:
+ return AMDGPU::S_XOR_SAVEEXEC_B64;
+ case AMDGPU::S_ANDN2_B64:
+ return AMDGPU::S_ANDN2_SAVEEXEC_B64;
+ case AMDGPU::S_ORN2_B64:
+ return AMDGPU::S_ORN2_SAVEEXEC_B64;
+ case AMDGPU::S_NAND_B64:
+ return AMDGPU::S_NAND_SAVEEXEC_B64;
+ case AMDGPU::S_NOR_B64:
+ return AMDGPU::S_NOR_SAVEEXEC_B64;
+ case AMDGPU::S_XNOR_B64:
+ return AMDGPU::S_XNOR_SAVEEXEC_B64;
+ default:
+ return AMDGPU::INSTRUCTION_LIST_END;
+ }
+}
+
+// These are only terminators to get correct spill code placement during
+// register allocation, so turn them back into normal instructions. Only one of
+// these is expected per block.
+static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_MOV_B64_term: {
+ MI.setDesc(TII.get(AMDGPU::COPY));
+ return true;
+ }
+ case AMDGPU::S_XOR_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+ return true;
+ }
+ case AMDGPU::S_ANDN2_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+static MachineBasicBlock::reverse_iterator fixTerminators(
+ const SIInstrInfo &TII,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+ for (; I != E; ++I) {
+ if (!I->isTerminator())
+ return I;
+
+ if (removeTerminatorBit(TII, *I))
+ return I;
+ }
+
+ return E;
+}
+
+static MachineBasicBlock::reverse_iterator findExecCopy(
+ const SIInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) {
+ const unsigned InstLimit = 25;
+
+ auto E = MBB.rend();
+ for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
+ unsigned CopyFromExec = isCopyFromExec(*I);
+ if (CopyFromExec != AMDGPU::NoRegister)
+ return I;
+ }
+
+ return E;
+}
+
+// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
+// report the register as unavailable because a super-register with a lane
+// mask is unavailable.
+static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ if (Succ->isLiveIn(Reg))
+ return true;
+ }
+
+ return false;
+}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // Optimize sequences emitted for control flow lowering. They are originally
+  // emitted as separate operations because spill code may need to be
+ // inserted for the saved copy of exec.
+ //
+ // x = copy exec
+ // z = s_<op>_b64 x, y
+ // exec = copy z
+ // =>
+ // x = s_<op>_saveexec_b64 y
+ //
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+ MachineBasicBlock::reverse_iterator E = MBB.rend();
+ if (I == E)
+ continue;
+
+ unsigned CopyToExec = isCopyToExec(*I);
+ if (CopyToExec == AMDGPU::NoRegister)
+ continue;
+
+ // Scan backwards to find the def.
+ auto CopyToExecInst = &*I;
+ auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+ if (CopyFromExecInst == E)
+ continue;
+
+ if (isLiveOut(MBB, CopyToExec)) {
+ // The copied register is live out and has a second use in another block.
+ DEBUG(dbgs() << "Exec copy source register is live out\n");
+ continue;
+ }
+
+ unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+ MachineInstr *SaveExecInst = nullptr;
+ SmallVector<MachineInstr *, 4> OtherUseInsts;
+
+ for (MachineBasicBlock::iterator J
+ = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+ J != JE; ++J) {
+ if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+ DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+ // Make sure this is inserted after any VALU ops that may have been
+ // scheduled in between.
+ SaveExecInst = nullptr;
+ break;
+ }
+
+ if (J->modifiesRegister(CopyToExec, TRI)) {
+ if (SaveExecInst) {
+ DEBUG(dbgs() << "Multiple instructions modify "
+ << PrintReg(CopyToExec, TRI) << '\n');
+ SaveExecInst = nullptr;
+ break;
+ }
+
+ unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
+ if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
+ break;
+
+ if (J->readsRegister(CopyFromExec, TRI)) {
+ SaveExecInst = &*J;
+ DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+ continue;
+ } else {
+ DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+ break;
+ }
+ }
+
+ if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
+ assert(SaveExecInst != &*J);
+ OtherUseInsts.push_back(&*J);
+ }
+ }
+
+ if (!SaveExecInst)
+ continue;
+
+ DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+
+ MachineOperand &Src0 = SaveExecInst->getOperand(1);
+ MachineOperand &Src1 = SaveExecInst->getOperand(2);
+
+ MachineOperand *OtherOp = nullptr;
+
+ if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
+ OtherOp = &Src1;
+ } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
+ if (!SaveExecInst->isCommutable())
+ break;
+
+ OtherOp = &Src0;
+ } else
+ llvm_unreachable("unexpected");
+
+ CopyFromExecInst->eraseFromParent();
+
+ auto InsPt = SaveExecInst->getIterator();
+ const DebugLoc &DL = SaveExecInst->getDebugLoc();
+
+ BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
+ CopyFromExec)
+ .addReg(OtherOp->getReg());
+ SaveExecInst->eraseFromParent();
+
+ CopyToExecInst->eraseFromParent();
+
+ for (MachineInstr *OtherInst : OtherUseInsts) {
+ OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+ AMDGPU::NoSubRegister, *TRI);
+ }
+ }
+
+ return true;
+
+}
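
Summarizing the driver loop above: terminators are first turned back into ordinary instructions, the final copy into EXEC is located, the matching copy from EXEC is found by scanning backwards within a small instruction window, and if exactly one s_<op>_b64 in between reads the saved copy and defines the value written to EXEC, it is rewritten into the fused saveexec form. A compressed sketch of that flow, with the failure paths and the OtherUseInsts fix-up left out (MBB and TII are assumed from the surrounding function):

    // Sketch: the happy path of the per-block rewrite above.
    MachineBasicBlock::reverse_iterator T = fixTerminators(*TII, MBB);
    unsigned CopyToExec = isCopyToExec(*T);                 // exec = COPY %z
    auto CopyFromExecInst = findExecCopy(*TII, MBB, T, CopyToExec); // %x = COPY exec
    // If the single s_<op>_b64 between the two copies reads %x and defines %z,
    // getSaveExecOp(<op>) names the fused s_<op>_saveexec_b64 that replaces all
    // three instructions; remaining readers of %z are redirected to EXEC.
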
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 347c33f..a1ed5e8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,52 +24,11 @@
using namespace llvm;
-static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned SIMDPerCU = 4;
-
- unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
- return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
- MaxInvocationsPerWave;
-}
-
-static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
-
- unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
- unsigned ReservedSGPRCount;
-
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- TotalSGPRCountPerSIMD = 800;
- AddressableSGPRCount = 102;
- SGPRUsageAlignment = 16;
- ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
- } else {
- TotalSGPRCountPerSIMD = 512;
- AddressableSGPRCount = 104;
- SGPRUsageAlignment = 8;
- ReservedSGPRCount = 2; // VCC
- }
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
- unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
- MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
-
- if (ST.hasSGPRInitBug())
- MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
-
- return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
-}
-
-static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
- unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
- unsigned TotalVGPRCountPerSIMD = 256;
- unsigned VGPRUsageAlignment = 4;
-
- return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
- VGPRUsageAlignment);
-}
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
@@ -95,19 +54,38 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
VGPRPressureSets(getNumRegPressureSets()) {
unsigned NumRegPressureSets = getNumRegPressureSets();
- SGPR32SetID = NumRegPressureSets;
- VGPR32SetID = NumRegPressureSets;
- for (unsigned i = 0; i < NumRegPressureSets; ++i) {
- if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
- SGPR32SetID = i;
- else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
- VGPR32SetID = i;
+ SGPRSetID = NumRegPressureSets;
+ VGPRSetID = NumRegPressureSets;
+ for (unsigned i = 0; i < NumRegPressureSets; ++i) {
classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
}
- assert(SGPR32SetID < NumRegPressureSets &&
- VGPR32SetID < NumRegPressureSets);
+
+ // Determine the number of reg units for each pressure set.
+ std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
+ for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
+ const int *PSets = getRegUnitPressureSets(i);
+ for (unsigned j = 0; PSets[j] != -1; ++j) {
+ ++PressureSetRegUnits[PSets[j]];
+ }
+ }
+
+ unsigned VGPRMax = 0, SGPRMax = 0;
+ for (unsigned i = 0; i < NumRegPressureSets; ++i) {
+ if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
+ VGPRSetID = i;
+ VGPRMax = PressureSetRegUnits[i];
+ continue;
+ }
+ if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
+ SGPRSetID = i;
+ SGPRMax = PressureSetRegUnits[i];
+ }
+ }
+
+ assert(SGPRSetID < NumRegPressureSets &&
+ VGPRSetID < NumRegPressureSets);
}
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
@@ -119,14 +97,14 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+ unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned RegCount = getMaxNumSGPRs(MF);
unsigned Reg;
// Try to place it in a hole after PrivateSegmentbufferReg.
@@ -161,18 +139,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
- unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
- unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
-
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
- for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+ unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
+ unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
-
- for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+ unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
+ unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
@@ -194,49 +170,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
- // Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs"
- // attribute was specified.
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (ST.debuggerReserveRegs()) {
- unsigned ReservedVGPRFirst =
- MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
- for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
- unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
- }
-
return Reserved;
}
-unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const {
- const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
- // FIXME: We should adjust the max number of waves based on LDS size.
- unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
- unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
-
- unsigned VSLimit = SGPRLimit + VGPRLimit;
-
- if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
- // FIXME: This is a hack. We should never be considering the pressure of
- // these since no virtual register should ever have this class.
- return VSLimit;
- }
-
- if (SGPRPressureSets.test(Idx))
- return SGPRLimit;
-
- return VGPRLimit;
-}
-
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo()->hasStackObjects();
+ return Fn.getFrameInfo().hasStackObjects();
}
bool
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo()->hasStackObjects();
+ return MF.getFrameInfo().hasStackObjects();
+}
+
+bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
+ const MachineFunction &MF) const {
+ // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
+ // create a virtual register for it during frame index elimination, so the
+ // scavenger is directly needed.
+ return MF.getFrameInfo().hasStackObjects() &&
+ MF.getSubtarget<SISubtarget>().hasScalarStores() &&
+ MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -250,6 +203,14 @@ bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const
return true;
}
+int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
+ assert(SIInstrInfo::isMUBUF(*MI));
+
+ int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::offset);
+ return MI->getOperand(OffIdx).getImm();
+}
+
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const {
if (!SIInstrInfo::isMUBUF(*MI))
@@ -259,13 +220,16 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
AMDGPU::OpName::vaddr) &&
"Should never see frame index on non-address operand");
- int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::offset);
- return MI->getOperand(OffIdx).getImm();
+ return getMUBUFInstrOffset(MI);
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
- return MI->mayLoadOrStore();
+ if (!MI->mayLoadOrStore())
+ return false;
+
+ int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+
+ return !isUInt<12>(FullOffset);
}
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -290,14 +254,19 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+ .addFrameIndex(FrameIdx);
+
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead)
.addReg(OffsetReg, RegState::Kill)
- .addFrameIndex(FrameIdx);
+ .addReg(FIReg);
}
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -328,40 +297,21 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
- if (isUInt<12>(NewOffset)) {
- // If we have a legal offset, fold it directly into the instruction.
- FIOp->ChangeToRegister(BaseReg, false);
- OffsetOp->setImm(NewOffset);
- return;
- }
-
- // The offset is not legal, so we must insert an add of the offset.
- MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- DebugLoc DL = MI.getDebugLoc();
-
- assert(Offset != 0 && "Non-zero offset expected");
-
- unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ assert(isUInt<12>(NewOffset) && "offset should be legal");
- // In the case the instruction already had an immediate offset, here only
- // the requested new offset is added because we are leaving the original
- // immediate in place.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(Offset);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg)
- .addReg(UnusedCarry, RegState::Define | RegState::Dead)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(BaseReg);
-
- FIOp->ChangeToRegister(NewReg, false);
+ FIOp->ChangeToRegister(BaseReg, false);
+ OffsetOp->setImm(NewOffset);
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
unsigned BaseReg,
int64_t Offset) const {
- return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
+ if (!SIInstrInfo::isMUBUF(*MI))
+ return false;
+
+ int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+
+ return isUInt<12>(NewOffset);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -407,31 +357,107 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
}
}
-void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp,
- const MachineOperand *SrcDst,
- unsigned ScratchRsrcReg,
- unsigned ScratchOffset,
- int64_t Offset,
- RegScavenger *RS) const {
+static int getOffsetMUBUFStore(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
+ return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
+ case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
+ return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+ default:
+ return -1;
+ }
+}
+
+static int getOffsetMUBUFLoad(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
+ return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
+ case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
+ return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
+ case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
+ return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
+ case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
+ return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
+ return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
+ default:
+ return -1;
+ }
+}
- unsigned Value = SrcDst->getReg();
- bool IsKill = SrcDst->isKill();
+// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
+// need to handle the case where an SGPR may need to be spilled while spilling.
+static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+ MachineFrameInfo &MFI,
+ MachineBasicBlock::iterator MI,
+ int Index,
+ int64_t Offset) {
+ MachineBasicBlock *MBB = MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = MI->mayStore();
+
+ unsigned Opc = MI->getOpcode();
+ int LoadStoreOp = IsStore ?
+ getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
+ if (LoadStoreOp == -1)
+ return false;
+
+ unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();
+
+ BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+ .addReg(Reg, getDefRegState(!IsStore))
+ .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+ .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ return true;
+}
+
+void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool IsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstOffset,
+ MachineMemOperand *MMO,
+ RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
- DebugLoc DL = MI->getDebugLoc();
- bool IsStore = MI->mayStore();
+ const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = Desc.mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
- unsigned SOffset = ScratchOffset;
- unsigned OriginalImmOffset = Offset;
+ unsigned SOffset = ScratchOffsetReg;
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
unsigned Size = NumSubRegs * 4;
+ int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ const int64_t OriginalImmOffset = Offset;
+
+ unsigned Align = MFI.getObjectAlignment(Index);
+ const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
if (!isUInt<12>(Offset + Size)) {
SOffset = AMDGPU::NoRegister;
@@ -450,20 +476,23 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
// subtract the offset after the spill to return ScratchOffset to it's
// original value.
RanOutOfSGPRs = true;
- SOffset = ScratchOffset;
+ SOffset = ScratchOffsetReg;
} else {
Scavenged = true;
}
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffset)
- .addImm(Offset);
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+
Offset = 0;
}
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
- unsigned SubReg = NumSubRegs > 1 ?
- getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
- Value;
+ const unsigned EltSize = 4;
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -473,23 +502,324 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(!IsStore))
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ auto MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | SrcDstRegState)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addMemOperand(NewMMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
+
if (RanOutOfSGPRs) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
- .addReg(ScratchOffset)
- .addImm(OriginalImmOffset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+ .addReg(ScratchOffsetReg)
+ .addImm(OriginalImmOffset);
+ }
+}
+
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+ bool Store) {
+ if (SuperRegSize % 16 == 0) {
+ return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+ AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+ }
+
+ if (SuperRegSize % 8 == 0) {
+ return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+ AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
}
+
+ return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+ AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
+}
+
+void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ RegScavenger *RS) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
+
+ unsigned ScalarStoreOp;
+ unsigned EltSize = 4;
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ if (SpillToSMEM && isSGPRClass(RC)) {
+ // XXX - if private_element_size is larger than 4 it might be useful to be
+ // able to spill wider vmem spills.
+ std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+ }
+
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ if (SpillToSMEM) {
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ // The allocated memory size is really the wavefront size * the frame
+ // index size. The widest register class is 64 bytes, so a 4-byte scratch
+ // allocation is enough to spill this in a single stack object.
+ //
+ // FIXME: Frame size/offsets are computed earlier than this, so the extra
+ // space is still unnecessarily allocated.
+
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // SMEM instructions only support a single offset, so increment the wave
+ // offset.
+
+ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
+ .addReg(SubReg, getKillRegState(IsKill)) // sdata
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ continue;
+ }
+
+ struct SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->getSpilledReg(MF, Index, i);
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(IsKill))
+ .addImm(Spill.Lane);
+
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
+ } else {
+ // Spill SGPR to a frame index.
+ // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineInstrBuilder Mov
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(SubReg, SubKillState);
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == e)
+ SuperKillState |= getKillRegState(IsKill);
+ Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+        .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ }
+ }
+
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
+ MI->eraseFromParent();
+ MFI->addToSpilledSGPRs(NumSubRegs);
+}
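
The SMEM path above addresses the spill slot as the wavefront size times the frame offset plus the per-element offset, because every lane of the wave owns a copy of the slot. A small standalone check of that arithmetic, assuming the 64-lane wavefront of these targets (the helper name is invented):

#include <cassert>
#include <cstdint>

// Byte offset of the i-th 32-bit element of an SGPR spill when it is written
// with scalar (SMEM) stores, mirroring the computation in spillSGPR above.
static int64_t smemSpillOffset(unsigned WavefrontSize, int64_t FrameOffset,
                               unsigned EltSize, unsigned Index) {
  return static_cast<int64_t>(WavefrontSize) * FrameOffset + EltSize * Index;
}

int main() {
  // A 64-lane wave spilling to a slot at frame offset 16, element 2:
  assert(smemSpillOffset(64, 16, 4, 2) == 1032);
  return 0;
}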
+
+void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ RegScavenger *RS) const {
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *MBB = MI->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
+
+ unsigned EltSize = 4;
+ unsigned ScalarLoadOp;
+
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ if (SpillToSMEM && isSGPRClass(RC)) {
+ // XXX - if private_element_size is larger than 4 it might be useful to be
+ // able to spill wider vmem spills.
+ std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+ }
+
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = NumSubRegs == 1 ?
+ SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ if (SpillToSMEM) {
+ // FIXME: Size may be > 4 but extra bytes wasted.
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Add i * 4 offset
+ int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+
+ continue;
+ }
+
+ SIMachineFunctionInfo::SpilledReg Spill
+ = MFI->getSpilledReg(MF, Index, i);
+
+ if (Spill.hasReg()) {
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ } else {
+ // Restore SGPR from a stack slot.
+ // FIXME: We should use S_LOAD_DWORD here for VI.
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad, EltSize,
+ MinAlign(Align, EltSize * i));
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
+ }
+
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
+ MI->eraseFromParent();
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
@@ -499,7 +829,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -514,66 +844,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- unsigned SuperReg = MI->getOperand(0).getReg();
- bool IsKill = MI->getOperand(0).isKill();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = getPhysRegSubReg(SuperReg,
- &AMDGPU::SGPR_32RegClass, i);
-
- struct SIMachineFunctionInfo::SpilledReg Spill =
- MFI->getSpilledReg(MF, Index, i);
-
- if (Spill.hasReg()) {
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
- .addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane);
-
- // FIXME: Since this spills to another register instead of an actual
- // frame index, we should delete the frame index when all references to
- // it are fixed.
- } else {
- // Spill SGPR to a frame index.
- // FIXME we should use S_STORE_DWORD here for VI.
- MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addReg(SubReg, SubKillState);
-
-
- // There could be undef components of a spilled super register.
- // TODO: Can we detect this and skip the spill?
- if (NumSubRegs > 1) {
- // The last implicit use of the SuperReg carries the "Kill" flag.
- unsigned SuperKillState = 0;
- if (i + 1 == e)
- SuperKillState |= getKillRegState(IsKill);
- Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
- }
-
- unsigned Size = FrameInfo->getObjectSize(Index);
- unsigned Align = FrameInfo->getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
- .addFrameIndex(Index) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
- }
- }
- MI->eraseFromParent();
- MFI->addToSpilledSGPRs(NumSubRegs);
+ spillSGPR(MI, Index, RS);
break;
}
@@ -583,49 +854,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
- &AMDGPU::SGPR_32RegClass, i);
- struct SIMachineFunctionInfo::SpilledReg Spill =
- MFI->getSpilledReg(MF, Index, i);
-
- if (Spill.hasReg()) {
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
- } else {
- // Restore SGPR from a stack slot.
- // FIXME: We should use S_LOAD_DWORD here for VI.
-
- unsigned Align = FrameInfo->getObjectAlignment(Index);
- unsigned Size = FrameInfo->getObjectSize(Index);
-
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
- .addFrameIndex(Index) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
- BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
- }
- }
-
- MI->eraseFromParent();
+ restoreSGPR(MI, Index, RS);
break;
}
@@ -635,34 +864,62 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE:
- buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::src),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
- MI->eraseFromParent();
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ Index,
+ VData->getReg(), VData->isKill(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+ MI->eraseFromParent();
break;
+ }
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
- buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::dst),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ Index,
+ VData->getReg(), VData->isKill(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MI->eraseFromParent();
break;
}
default: {
- int64_t Offset = FrameInfo->getObjectOffset(Index);
+ if (TII->isMUBUF(*MI)) {
+ // Disable offen so we don't need a 0 vgpr base.
+ assert(static_cast<int>(FIOperandNum) ==
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr));
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ int64_t OldImm
+ = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
+ int64_t NewOffset = OldImm + Offset;
+
+ if (isUInt<12>(NewOffset) &&
+ buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+ MI->eraseFromParent();
+ break;
+ }
+ }
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -770,7 +1027,8 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
// We can assume that each lane corresponds to one 32-bit register.
- unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
+ LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger();
+ unsigned Count = countPopulation(Mask);
if (isSGPRClass(RC)) {
switch (Count) {
case 1:
@@ -812,7 +1070,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
// We want to prefer the smallest register class possible, so we don't want to
// stop and rewrite on anything that looks like a subregister
// extract. Operations mostly don't care about the super register class, so we
- // only want to stop on the most basic of copies between the smae register
+ // only want to stop on the most basic of copies between the same register
// class.
//
// e.g. if we have something like
@@ -828,80 +1086,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
- const TargetRegisterClass *SubRC,
- unsigned Channel) const {
-
- switch (Reg) {
- case AMDGPU::VCC:
- switch(Channel) {
- case 0: return AMDGPU::VCC_LO;
- case 1: return AMDGPU::VCC_HI;
- default: llvm_unreachable("Invalid SubIdx for VCC"); break;
- }
-
- case AMDGPU::TBA:
- switch(Channel) {
- case 0: return AMDGPU::TBA_LO;
- case 1: return AMDGPU::TBA_HI;
- default: llvm_unreachable("Invalid SubIdx for TBA"); break;
- }
-
- case AMDGPU::TMA:
- switch(Channel) {
- case 0: return AMDGPU::TMA_LO;
- case 1: return AMDGPU::TMA_HI;
- default: llvm_unreachable("Invalid SubIdx for TMA"); break;
- }
-
- case AMDGPU::FLAT_SCR:
- switch (Channel) {
- case 0:
- return AMDGPU::FLAT_SCR_LO;
- case 1:
- return AMDGPU::FLAT_SCR_HI;
- default:
- llvm_unreachable("Invalid SubIdx for FLAT_SCR");
- }
- break;
-
- case AMDGPU::EXEC:
- switch (Channel) {
- case 0:
- return AMDGPU::EXEC_LO;
- case 1:
- return AMDGPU::EXEC_HI;
- default:
- llvm_unreachable("Invalid SubIdx for EXEC");
- }
- break;
- }
-
- const TargetRegisterClass *RC = getPhysRegClass(Reg);
- // 32-bit registers don't have sub-registers, so we can just return the
- // Reg. We need to have this check here, because the calculation below
- // using getHWRegIndex() will fail with special 32-bit registers like
- // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
- if (RC->getSize() == 4) {
- assert(Channel == 0);
- return Reg;
- }
-
- unsigned Index = getHWRegIndex(Reg);
- return SubRC->getRegister(Index + Channel);
-}
-
-bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
- return OpType == AMDGPU::OPERAND_REG_IMM32;
-}
-
-bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
- if (opCanUseLiteralConstant(OpType))
- return true;
-
- return OpType == AMDGPU::OPERAND_REG_INLINE_C;
-}
-
// FIXME: Most of these are flexible with HSA and we don't need to reserve them
// as input registers if unused. Whether the dispatch ptr is necessary should be
// easy to detect from used intrinsics. Scratch setup is harder to know.
@@ -924,14 +1108,18 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
+ if (ST.isAmdCodeObjectV2(MF)) {
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ }
+ assert(MFI->hasPrivateMemoryInputPtr());
+ return MFI->PrivateMemoryPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
case SIRegisterInfo::DISPATCH_ID:
- llvm_unreachable("unimplemented");
+ assert(MFI->hasDispatchID());
+ return MFI->DispatchIDUserSGPR;
case SIRegisterInfo::FLAT_SCRATCH_INIT:
assert(MFI->hasFlatScratchInit());
return MFI->FlatScratchInitUserSGPR;
@@ -968,50 +1156,323 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return AMDGPU::NoRegister;
}
-unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
- switch(WaveCount) {
- case 10: return 24;
- case 9: return 28;
- case 8: return 32;
- case 7: return 36;
- case 6: return 40;
- case 5: return 48;
- case 4: return 64;
- case 3: return 84;
- case 2: return 128;
- default: return 256;
+unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 800;
+ return 512;
+}
+
+unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 102;
+ return 104;
+}
+
+unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
+ const SIMachineFunctionInfo &MFI) const {
+ if (MFI.hasFlatScratchInit()) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
+
+ if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+ return 4; // FLAT_SCRATCH, VCC (in that order)
}
+
+ if (ST.isXNACKEnabled())
+ return 4; // XNACK, VCC (in that order)
+
+ return 2; // VCC.
}
-unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,
- unsigned WaveCount) const {
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- switch (WaveCount) {
+unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
+ unsigned WavesPerEU) const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ switch (WavesPerEU) {
+ case 0: return 0;
+ case 10: return 0;
+ case 9: return 0;
+ case 8: return 81;
+ default: return 97;
+ }
+ } else {
+ switch (WavesPerEU) {
+ case 0: return 0;
+ case 10: return 0;
+ case 9: return 49;
+ case 8: return 57;
+ case 7: return 65;
+ case 6: return 73;
+ case 5: return 81;
+ default: return 97;
+ }
+ }
+}
+
+unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
+ unsigned WavesPerEU,
+ bool Addressable) const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ switch (WavesPerEU) {
+ case 0: return 80;
case 10: return 80;
case 9: return 80;
case 8: return 96;
- default: return 102;
+ default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
}
} else {
- switch(WaveCount) {
+ switch (WavesPerEU) {
+ case 0: return 48;
case 10: return 48;
case 9: return 56;
case 8: return 64;
case 7: return 72;
case 6: return 80;
case 5: return 96;
- default: return 103;
+ default: return getNumAddressableSGPRs(ST);
}
}
}
-bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- const TargetRegisterClass *RC;
+unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of SGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
+ unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
+
+ // Check if maximum number of SGPRs was explicitly requested using
+ // "amdgpu-num-sgpr" attribute.
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
+ Requested = 0;
+
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < NumInputSGPRs)
+ Requested = NumInputSGPRs;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumSGPRs = Requested;
+ }
+
+ if (ST.hasSGPRInitBug())
+ MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
+ MaxNumAddressableSGPRs);
+}
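
A sketch of the clamping order getMaxNumSGPRs applies to an "amdgpu-num-sgpr" request, extracted for readability; the function and parameter names below are invented, and the numbers in the example are plausible values rather than figures taken from a specific subtarget.

#include <cassert>

// Requests that do not cover the reserved SGPRs are ignored, requests below
// the number of preloaded input SGPRs are raised to cover them, and requests
// outside the bounds implied by the waves-per-EU limits are ignored again.
static unsigned clampRequestedSGPRs(unsigned Requested, unsigned Reserved,
                                    unsigned NumInputSGPRs,
                                    unsigned MaxForMinWaves,
                                    unsigned MinForMaxWaves) {
  if (Requested && Requested <= Reserved)
    Requested = 0;
  if (Requested && Requested < NumInputSGPRs)
    Requested = NumInputSGPRs;
  if (Requested && Requested > MaxForMinWaves)
    Requested = 0;
  if (MinForMaxWaves && Requested && Requested < MinForMaxWaves)
    Requested = 0;
  return Requested; // 0 means "keep the default maximum"
}

int main() {
  // A request of 40 SGPRs is raised to cover 48 preloaded input SGPRs.
  assert(clampRequestedSGPRs(40, 6, 48, 102, 0) == 48);
  // A request smaller than the reserved register count is ignored.
  assert(clampRequestedSGPRs(4, 6, 16, 102, 0) == 0);
  return 0;
}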
+
+unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
+ const SISubtarget &ST) const {
+ if (ST.debuggerReserveRegs())
+ return 4;
+ return 0;
+}
+
+unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
+ switch (WavesPerEU) {
+ case 0: return 0;
+ case 10: return 0;
+ case 9: return 25;
+ case 8: return 29;
+ case 7: return 33;
+ case 6: return 37;
+ case 5: return 41;
+ case 4: return 49;
+ case 3: return 65;
+ case 2: return 85;
+ default: return 129;
+ }
+}
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
+ switch (WavesPerEU) {
+ case 0: return 24;
+ case 10: return 24;
+ case 9: return 28;
+ case 8: return 32;
+ case 7: return 36;
+ case 6: return 40;
+ case 5: return 48;
+ case 4: return 64;
+ case 3: return 84;
+ case 2: return 128;
+ default: return getTotalNumVGPRs();
+ }
+}
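
The per-wave VGPR limits in the switch above are consistent with dividing the 256 VGPRs of a SIMD among the concurrent waves and rounding down to the 4-register allocation granularity. A small check of that reading (my own derivation, not code from the patch; WavesPerEU values of 0 and 1 keep the defaults handled explicitly above):

#include <cassert>

// Maximum VGPRs per wave if 256 VGPRs are split across WavesPerEU waves and
// rounded down to the allocation granularity of 4. Matches the table above
// for WavesPerEU in the range 1..10.
static unsigned maxVGPRsFor(unsigned WavesPerEU) {
  const unsigned TotalVGPRs = 256;
  const unsigned Granule = 4;
  return (TotalVGPRs / WavesPerEU) / Granule * Granule;
}

int main() {
  assert(maxVGPRsFor(10) == 24);
  assert(maxVGPRsFor(6) == 40);
  assert(maxVGPRsFor(3) == 84);
  assert(maxVGPRsFor(1) == 256);
  return 0;
}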
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of VGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+
+ // Check if maximum number of VGPRs was explicitly requested using
+ // "amdgpu-num-vgpr" attribute.
+ if (F.hasFnAttribute("amdgpu-num-vgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-vgpr", MaxNumVGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
+ Requested = 0;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumVGPRs = Requested;
+ }
+
+ return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
+}
+
+ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const {
+ if (EltSize == 4) {
+ static const int16_t Sub0_15[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_7[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ };
+
+ static const int16_t Sub0_3[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ };
+
+ static const int16_t Sub0_2[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
+ };
+
+ static const int16_t Sub0_1[] = {
+ AMDGPU::sub0, AMDGPU::sub1,
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+ case 32:
+ return {};
+ case 64:
+ return makeArrayRef(Sub0_1);
+ case 96:
+ return makeArrayRef(Sub0_2);
+ case 128:
+ return makeArrayRef(Sub0_3);
+ case 256:
+ return makeArrayRef(Sub0_7);
+ case 512:
+ return makeArrayRef(Sub0_15);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ if (EltSize == 8) {
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
+ };
+
+ static const int16_t Sub0_7_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
+ };
+
+ static const int16_t Sub0_3_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+ case 64:
+ return {};
+ case 128:
+ return makeArrayRef(Sub0_3_64);
+ case 256:
+ return makeArrayRef(Sub0_7_64);
+ case 512:
+ return makeArrayRef(Sub0_15_64);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ assert(EltSize == 16 && "unhandled register spill split size");
+
+ static const int16_t Sub0_15_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15
+ };
+
+ static const int16_t Sub0_7_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7
+ };
+
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+ case 128:
+ return {};
+ case 256:
+ return makeArrayRef(Sub0_7_128);
+ case 512:
+ return makeArrayRef(Sub0_15_128);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+}
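
The split tables above simply enumerate sub-register indices of the requested width, so the number of pieces the spill and restore loops iterate over is the register width divided by the element size (an empty list stands for "no split"). A quick sanity check, with an invented helper name:

#include <cassert>

// Number of EltSize-byte pieces a register of RegBits bits is split into;
// getRegSplitParts returns an empty list when this is 1.
static unsigned numSplitParts(unsigned RegBits, unsigned EltBytes) {
  return RegBits / (8 * EltBytes);
}

int main() {
  assert(numSplitParts(512, 4) == 16); // sub0 .. sub15
  assert(numSplitParts(256, 8) == 4);  // sub0_sub1 .. sub6_sub7
  assert(numSplitParts(128, 16) == 1); // no split needed
  return 0;
}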
+
+const TargetRegisterClass*
+SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
if (TargetRegisterInfo::isVirtualRegister(Reg))
- RC = MRI.getRegClass(Reg);
- else
- RC = getPhysRegClass(Reg);
+ return MRI.getRegClass(Reg);
- return hasVGPRs(RC);
+ return getPhysRegClass(Reg);
+}
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ return hasVGPRs(getRegClassForReg(MRI, Reg));
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index d8b2d9f..0bcae7d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -16,17 +16,19 @@
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
class SISubtarget;
class MachineRegisterInfo;
+class SIMachineFunctionInfo;
-struct SIRegisterInfo final : public AMDGPURegisterInfo {
+class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
- unsigned SGPR32SetID;
- unsigned VGPR32SetID;
+ unsigned SGPRSetID;
+ unsigned VGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
@@ -48,17 +50,16 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
- unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const override;
-
-
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
-
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+ bool requiresFrameIndexReplacementScavenging(
+ const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+ int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
@@ -77,6 +78,12 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
+ void spillSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
+ void restoreSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
@@ -111,13 +118,6 @@ public:
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
- /// returns true if this is a pseudoregister class combination of VGPRs and
- /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on
- /// them.
- static bool isPseudoRegClass(const TargetRegisterClass *RC) {
- return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass;
- }
-
/// \returns A VGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
@@ -137,20 +137,21 @@ public:
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const override;
- /// \p Channel This is the register channel (e.g. a value from 0-16), not the
- /// SubReg index.
- /// \returns The sub-register of Reg that is in Channel.
- unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
- unsigned Channel) const;
-
/// \returns True if operands defined with this operand type can accept
/// a literal constant (i.e. any 32-bit immediate).
- bool opCanUseLiteralConstant(unsigned OpType) const;
+ bool opCanUseLiteralConstant(unsigned OpType) const {
+ // TODO: 64-bit operands have extending behavior from 32-bit literal.
+ return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
+ }
/// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
- bool opCanUseInlineConstant(unsigned OpType) const;
+ bool opCanUseInlineConstant(unsigned OpType) const {
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+ }
enum PreloadedValue {
// SGPRS:
@@ -176,29 +177,104 @@ public:
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
- /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
- /// concurrent waves.
- unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
-
- /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
- /// concurrent waves.
- unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;
-
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
const MachineFunction &MF) const;
- unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
- unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
+ unsigned getSGPRPressureSet() const { return SGPRSetID; };
+ unsigned getVGPRPressureSet() const { return VGPRSetID; };
+ const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
+ unsigned Reg) const;
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+ bool isSGPRPressureSet(unsigned SetID) const {
+ return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+ }
+ bool isVGPRPressureSet(unsigned SetID) const {
+ return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+ }
+
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return 8;
+ }
+
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
+
+ /// \returns Number of addressable SGPRs supported by the subtarget.
+ unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
+
+ /// \returns Number of reserved SGPRs supported by the subtarget.
+ unsigned getNumReservedSGPRs(const SISubtarget &ST,
+ const SIMachineFunctionInfo &MFI) const;
+
+ /// \returns Minimum number of SGPRs that meets given number of waves per
+ /// execution unit requirement for given subtarget.
+ unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of SGPRs that meets given number of waves per
+ /// execution unit requirement for given subtarget.
+ unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
+ bool Addressable) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule() const {
+ return 4;
+ }
+
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return 256;
+ }
+
+ /// \returns Number of reserved VGPRs for debugger use supported by the
+ /// subtarget.
+ unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+
+ /// \returns Minimum number of SGPRs that meets given number of waves per
+ /// execution unit requirement.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of VGPRs that meets given number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
+
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+ ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const;
+
private:
- void buildScratchLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp, const MachineOperand *SrcDst,
- unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset,
- RegScavenger *RS) const;
+ void buildSpillLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool ValueIsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstrOffset,
+ MachineMemOperand *MMO,
+ RegScavenger *RS) const;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c427874..31e714b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -120,12 +120,19 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
let isAllocatable = 0;
}
+def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
(add (sequence "SGPR%u", 0, 103))> {
- let AllocationPriority = 1;
+ // Give all SGPR classes higher priority than VGPR classes, because
+ // we want to spill SGPRs to VGPRs.
+ let AllocationPriority = 7;
}
// SGPR 64-bit registers
@@ -190,9 +197,10 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 3), 4))]>;
// VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
+ let Size = 32;
}
// VGPR 64-bit registers
@@ -248,43 +256,51 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
-class RegImmMatcher<string name> : AsmOperandClass {
- let Name = name;
- let RenderMethod = "addRegOrImmOperands";
-}
-
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
- let AllocationPriority = 1;
+ let AllocationPriority = 7;
+}
+
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
+ let AllocationPriority = 7;
}
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add SReg_32_XM0, M0)> {
- let AllocationPriority = 1;
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
+ let AllocationPriority = 7;
}
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
- let AllocationPriority = 2;
+ let CopyCost = 1;
+ let AllocationPriority = 8;
}
def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
let isAllocatable = 0;
}
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+ (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+ let CopyCost = 1;
+ let AllocationPriority = 8;
+}
+
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> {
- let AllocationPriority = 2;
+ (add SReg_64_XEXEC, EXEC)> {
+ let CopyCost = 1;
+ let AllocationPriority = 8;
}
// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {
def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
- let AllocationPriority = 4;
+ let AllocationPriority = 10;
}
def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
@@ -292,7 +308,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
- let AllocationPriority = 4;
+ let AllocationPriority = 10;
}
} // End CopyCost = 2
@@ -300,17 +316,19 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128,
def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 5;
+ let AllocationPriority = 11;
}
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 6;
+ let AllocationPriority = 12;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+ let Size = 64;
+
// Requires 2 v_mov_b32 to copy
let CopyCost = 2;
let AllocationPriority = 2;
@@ -325,17 +343,21 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
}
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+ let Size = 128;
+
// Requires 4 v_mov_b32 to copy
let CopyCost = 4;
let AllocationPriority = 4;
}
def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
+ let Size = 256;
let CopyCost = 8;
let AllocationPriority = 5;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+ let Size = 512;
let CopyCost = 16;
let AllocationPriority = 6;
}
@@ -344,80 +366,100 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
}
-class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_IMM32";
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+ (add VGPR_32, SReg_32)> {
+ let isAllocatable = 0;
}
-class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_INLINE_C";
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+ let isAllocatable = 0;
}
//===----------------------------------------------------------------------===//
-// SSrc_* Operands with an SGPR or a 32-bit immediate
+// Register operands
//===----------------------------------------------------------------------===//
-def SSrc_32 : RegImmOperand<SReg_32> {
- let ParserMatchClass = RegImmMatcher<"SSrc32">;
+class RegImmMatcher<string name> : AsmOperandClass {
+ let Name = name;
+ let RenderMethod = "addRegOrImmOperands";
}
-def SSrc_64 : RegImmOperand<SReg_64> {
- let ParserMatchClass = RegImmMatcher<"SSrc64">;
+multiclass SIRegOperand <string rc, string MatchName, string opType> {
+ let OperandNamespace = "AMDGPU" in {
+ def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_INT16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
+ let DecoderMethod = "decodeOperand_VSrc16";
+ }
+
+ def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_FP16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
+ let DecoderMethod = "decodeOperand_VSrc16";
+ }
+
+ def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_INT32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
+ }
+
+ def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_FP32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+ }
+
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_INT64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
+ }
+
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_FP64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
+ }
+ }
}
+// FIXME: 64-bit sources can sometimes use 32-bit constants.
+multiclass RegImmOperand <string rc, string MatchName>
+ : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
+
+multiclass RegInlineOperand <string rc, string MatchName>
+ : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+
//===----------------------------------------------------------------------===//
-// SCSrc_* Operands with an SGPR or a inline constant
+// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-def SCSrc_32 : RegInlineOperand<SReg_32> {
- let ParserMatchClass = RegImmMatcher<"SCSrc32">;
-}
+defm SSrc : RegImmOperand<"SReg", "SSrc">;
//===----------------------------------------------------------------------===//
-// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+// SCSrc_* Operands with an SGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
+defm SCSrc : RegInlineOperand<"SReg", "SCSrc">;
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
- let CopyCost = 2;
-}
+//===----------------------------------------------------------------------===//
+// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
-def VSrc_32 : RegisterOperand<VS_32> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_IMM32";
- let ParserMatchClass = RegImmMatcher<"VSrc32">;
-}
+defm VSrc : RegImmOperand<"VS", "VSrc">;
-def VSrc_64 : RegisterOperand<VS_64> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_IMM32";
- let ParserMatchClass = RegImmMatcher<"VSrc64">;
-}
+def VSrc_128 : RegisterOperand<VReg_128>;
//===----------------------------------------------------------------------===//
-// VCSrc_* Operands with an SGPR, VGPR or an inline constant
+// VSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
-def VCSrc_32 : RegisterOperand<VS_32> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_INLINE_C";
- let ParserMatchClass = RegImmMatcher<"VCSrc32">;
-}
-
-def VCSrc_64 : RegisterOperand<VS_64> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_INLINE_C";
- let ParserMatchClass = RegImmMatcher<"VCSrc64">;
+// This is for operands with the enum(9), VSrc encoding restriction,
+// but only allows VGPRs.
+def VRegSrc_32 : RegisterOperand<VGPR_32> {
+ //let ParserMatchClass = RegImmMatcher<"VRegSrc32">;
+ let DecoderMethod = "DecodeVS_32RegisterClass";
}
//===----------------------------------------------------------------------===//
-// SCSrc_* Operands with an SGPR or an inline constant
+// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def SCSrc_64 : RegisterOperand<SReg_64> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_INLINE_C";
- let ParserMatchClass = RegImmMatcher<"SCSrc64">;
-}
+defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index ed19217..be27966 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -46,7 +46,11 @@ def Write64Bit : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 0;
+ let CompleteModel = 1;
+  // MicroOpBufferSize = 1 means that instructions will always be added to
+ // the ready queue when they become available. This exposes them
+ // to the register pressure analysis.
+ let MicroOpBufferSize = 1;
let IssueWidth = 1;
let PostRAScheduler = 1;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 6cba553..dd31dc6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -45,9 +45,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Shrink Instructions";
- }
+ StringRef getPassName() const override { return "SI Shrink Instructions"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -86,13 +84,19 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
// a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
- // a register allocation hint pre-regalloc and then do the shrining
+ // a register allocation hint pre-regalloc and then do the shrinking
// post-regalloc.
if (Src2) {
switch (MI.getOpcode()) {
default: return false;
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ // Additional verification is needed for sdst/src2.
+ return true;
+
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64:
if (!isVGPR(Src2, TRI, MRI) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -134,23 +138,15 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
// Only one literal constant is allowed per instruction, so if src0 is a
// literal constant then we can't do any folding.
- if (Src0.isImm() &&
- TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
- return;
-
- // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
- // SGPR, we cannot commute the instruction, so we can't fold any literal
- // constants.
- if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
+ if (TII->isLiteralConstant(MI, Src0Idx))
return;
// Try to fold Src0
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
unsigned Reg = Src0.getReg();
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
@@ -158,7 +154,8 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineOperand &MovSrc = Def->getOperand(1);
bool ConstantFolded = false;
- if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+ if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+ isUInt<32>(MovSrc.getImm()))) {
Src0.ChangeToImmediate(MovSrc.getImm());
ConstantFolded = true;
}
@@ -182,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -191,7 +188,95 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
}
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
- return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
+ return isInt<16>(Src.getImm()) &&
+ !TII->isInlineConstant(*Src.getParent(),
+ Src.getParent()->getOperandNo(&Src));
+}
+
+static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isUInt<16>(Src.getImm()) &&
+ !TII->isInlineConstant(*Src.getParent(),
+ Src.getParent()->getOperandNo(&Src));
+}
+
+static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ bool &IsUnsigned) {
+ if (isInt<16>(Src.getImm())) {
+ IsUnsigned = false;
+ return !TII->isInlineConstant(Src);
+ }
+
+ if (isUInt<16>(Src.getImm())) {
+ IsUnsigned = true;
+ return !TII->isInlineConstant(Src);
+ }
+
+ return false;
+}
+
+/// \returns true if the constant in \p Src should be replaced with a bitreverse
+/// of an inline immediate.
+static bool isReverseInlineImm(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ int32_t &ReverseImm) {
+ if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
+ return false;
+
+ ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
+ return ReverseImm >= -16 && ReverseImm <= 64;
+}
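
isReverseInlineImm above catches 32-bit immediates that are not inline constants themselves but whose bit reversal lands in the inline range [-16, 64], so a 32-bit mov of the value can be shrunk to a bit-reverse of the small constant. A standalone illustration (reverse32 is a hand-rolled stand-in for llvm::reverseBits):

#include <cassert>
#include <cstdint>

// Reverse the bits of a 32-bit value, bit 31 becoming bit 0 and so on.
static int32_t reverse32(uint32_t V) {
  uint32_t R = 0;
  for (int i = 0; i < 32; ++i)
    R |= ((V >> i) & 1u) << (31 - i);
  return static_cast<int32_t>(R);
}

int main() {
  // 0x80000000 is not an inline constant, but its bit reverse is 1, which is;
  // a v_mov_b32 of 0x80000000 can therefore become a v_bfrev_b32 of 1.
  assert(reverse32(0x80000000u) == 1);
  return 0;
}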
+
+/// Copy implicit register operands from specified instruction to this
+/// instruction that are not part of the instruction definition.
+static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
+ const MachineInstr &MI) {
+ for (unsigned i = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses() +
+ MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
+ NewMI.addOperand(MF, MO);
+ }
+}
+
+static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
+ // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
+ // get constants on the RHS.
+ if (!MI.getOperand(0).isReg())
+ TII->commuteInstruction(MI, false, 0, 1);
+
+ const MachineOperand &Src1 = MI.getOperand(1);
+ if (!Src1.isImm())
+ return;
+
+ int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
+ if (SOPKOpc == -1)
+ return;
+
+ // eq/ne is special because the imm16 can be treated as signed or unsigned,
+  // and initially selected as the unsigned versions.
+ if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
+ bool HasUImm;
+ if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
+ if (!HasUImm) {
+ SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
+ AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
+ }
+
+ MI.setDesc(TII->get(SOPKOpc));
+ }
+
+ return;
+ }
+
+ const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
+
+ if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
+ (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
+ MI.setDesc(NewDesc);
+ }
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
@@ -226,14 +311,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() &&
TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
- int64_t Imm = Src.getImm();
- if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
- int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
- if (ReverseImm >= -16 && ReverseImm <= 64) {
- MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
- Src.setImm(ReverseImm);
- continue;
- }
+ int32_t ReverseImm;
+ if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+ Src.setImm(ReverseImm);
+ continue;
}
}
}
@@ -272,21 +354,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// satisfied.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
MI.getOpcode() == AMDGPU::S_MUL_I32) {
- const MachineOperand &Dest = MI.getOperand(0);
- const MachineOperand &Src0 = MI.getOperand(1);
- const MachineOperand &Src1 = MI.getOperand(2);
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+
+ if (!Src0->isReg() && Src1->isReg()) {
+ if (TII->commuteInstruction(MI, false, 1, 2))
+ std::swap(Src0, Src1);
+ }
// FIXME: This could work better if hints worked with subregisters. If
// we have a vector add of a constant, we usually don't get the correct
// allocation due to the subregister usage.
- if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
- Src0.isReg()) {
- MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+ if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+ Src0->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+ MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
}
- if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
- if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+ if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
+ if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
@@ -296,12 +384,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // Try to use s_cmpk_*
+ if (MI.isCompare() && TII->isSOPC(MI)) {
+ shrinkScalarCompare(TII, MI);
+ continue;
+ }
+
// Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
- const MachineOperand &Src = MI.getOperand(1);
+ const MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm() && isKImmOperand(TII, Src))
- MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+ int32_t ReverseImm;
+ if (isKImmOperand(TII, Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+ else if (isReverseInlineImm(TII, Src, ReverseImm)) {
+ MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
+ Src.setImm(ReverseImm);
+ }
+ }
continue;
}
@@ -358,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Check for the bool flag output for instructions like V_ADD_I32_e64.
+ const MachineOperand *SDst = TII->getNamedOperand(MI,
+ AMDGPU::OpName::sdst);
+
+ // Check the carry-in operand for v_addc_u32_e64.
+ const MachineOperand *Src2 = TII->getNamedOperand(MI,
+ AMDGPU::OpName::src2);
+
+ if (SDst) {
+ if (SDst->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+ MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
+ continue;
+ }
+
+ // All of the instructions with carry outs also have an SGPR input in
+ // src2.
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+ MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+
+ continue;
+ }
+ }
+
// We can shrink this instruction
DEBUG(dbgs() << "Shrinking " << MI);
@@ -383,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src1)
Inst32.addOperand(*Src1);
- const MachineOperand *Src2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2) {
int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
if (Op32Src2Idx != -1) {
@@ -398,9 +524,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
++NumInstructionsShrunk;
- MI.eraseFromParent();
+ // Copy extra operands not present in the instruction definition.
+ copyExtraImplicitOps(*Inst32, MF, MI);
+
+ MI.eraseFromParent();
foldImmediates(*Inst32, TII, MRI);
+
DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
index facc0c7..aad6853 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -42,9 +42,7 @@ public:
SITypeRewriter() : FunctionPass(ID) { }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override {
- return "SI Type Rewriter";
- }
+ StringRef getPassName() const override { return "SI Type Rewriter"; }
void visitLoadInst(LoadInst &I);
void visitCallInst(CallInst &I);
void visitBitCast(BitCastInst &I);
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 1534d58..a613a22 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -53,10 +53,28 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <vector>
using namespace llvm;
@@ -69,6 +87,25 @@ enum {
StateExact = 0x2,
};
+struct PrintState {
+public:
+ int State;
+
+ explicit PrintState(int State) : State(State) {}
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+ if (PS.State & StateWQM)
+ OS << "WQM";
+ if (PS.State & StateExact) {
+ if (PS.State & StateWQM)
+ OS << '|';
+ OS << "Exact";
+ }
+
+ return OS;
+}
+
struct InstrInfo {
char Needs = 0;
char OutNeeds = 0;
@@ -84,7 +121,7 @@ struct WorkItem {
MachineBasicBlock *MBB = nullptr;
MachineInstr *MI = nullptr;
- WorkItem() {}
+ WorkItem() = default;
WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(MachineInstr *MI) : MI(MI) {}
};
@@ -98,16 +135,26 @@ private:
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
+ void printInfo();
+
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
+ void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);
+ bool requiresCorrectState(const MachineInstr &MI) const;
+
+ MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before);
+ MachineBasicBlock::iterator
+ prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last, bool PreferLast,
+ bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
@@ -124,9 +171,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Whole Quad Mode";
- }
+ StringRef getPassName() const override { return "SI Whole Quad Mode"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
@@ -135,7 +180,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char SIWholeQuadMode::ID = 0;
@@ -151,6 +196,24 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
+void SIWholeQuadMode::printInfo() {
+ for (const auto &BII : Blocks) {
+ dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+ << " InNeeds = " << PrintState(BII.second.InNeeds)
+ << ", Needs = " << PrintState(BII.second.Needs)
+ << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+ for (const MachineInstr &MI : *BII.first) {
+ auto III = Instructions.find(&MI);
+ if (III == Instructions.end())
+ continue;
+
+ dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
+ << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+ }
+ }
+}
+
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
@@ -168,6 +231,45 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
Worklist.push_back(&MI);
}
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+ std::vector<WorkItem> &Worklist) {
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
+ continue;
+
+ unsigned Reg = Use.getReg();
+
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Reg == AMDGPU::EXEC)
+ continue;
+
+ for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
+ continue;
+
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ Worklist);
+ }
+
+ continue;
+ }
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, StateWQM, Worklist);
+ }
+}
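markUsesWQM is the backwards-propagation step of the analysis: once an instruction needs whole-quad inputs, the definers of its operands need WQM too, and so on transitively. The toy below models that idea with plain integers (one definer per value, SSA-style); it is a conceptual sketch only and shares no data structures with the pass.

#include <cstdio>
#include <map>
#include <set>
#include <vector>

struct ToyInstr {
  int Id;
  std::vector<int> UsedValues; // values this instruction reads
};

// Push the definer of every value used by I, like markUsesWQM does for MI.
static void markUses(const ToyInstr &I, const std::map<int, int> &DefOf,
                     std::vector<int> &Worklist) {
  for (int V : I.UsedValues) {
    auto It = DefOf.find(V); // value -> defining instruction id
    if (It != DefOf.end())
      Worklist.push_back(It->second);
  }
}

// The sampling case in scanInstructions seeds only the sampler's inputs;
// from there the WQM requirement spreads to their inputs, and so on.
static std::set<int> propagateWQM(const std::vector<ToyInstr> &Instrs,
                                  const std::map<int, int> &DefOf,
                                  const ToyInstr &Sampler) {
  std::set<int> NeedsWQM;
  std::vector<int> Worklist;
  markUses(Sampler, DefOf, Worklist);
  while (!Worklist.empty()) {
    int Cur = Worklist.back();
    Worklist.pop_back();
    if (!NeedsWQM.insert(Cur).second)
      continue; // already marked
    markUses(Instrs[Cur], DefOf, Worklist);
  }
  return NeedsWQM;
}

int main() {
  // instr 0 defines %0, instr 1 reads %0 and defines %1, instr 2 samples %1.
  std::vector<ToyInstr> Instrs = {{0, {}}, {1, {0}}, {2, {1}}};
  std::map<int, int> DefOf = {{0, 0}, {1, 1}};
  for (int Id : propagateWQM(Instrs, DefOf, Instrs[2]))
    std::printf("instr %d needs WQM\n", Id);
  return 0;
}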
+
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +285,19 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode)) {
Flags = StateWQM;
+ } else if (TII->isWQM(Opcode)) {
+ // Sampling instructions don't need to produce results for all pixels
+ // in a quad; they just require all inputs of a quad to have been
+ // computed for derivatives.
+ markUsesWQM(MI, Worklist);
+ GlobalFlags |= StateWQM;
+ continue;
} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
- // Handle export instructions with the exec mask valid flag set
- if (Opcode == AMDGPU::EXP) {
- if (MI.getOperand(4).getImm() != 0)
- ExecExports.push_back(&MI);
- } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+ if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@@ -259,43 +364,9 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
- if (II.Needs != StateWQM)
- return;
-
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
-
- unsigned Reg = Use.getReg();
-
- // Handle physical registers that we need to track; this is mostly relevant
- // for VCC, which can appear as the (implicit) input of a uniform branch,
- // e.g. when a loop counter is stored in a VGPR.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (Reg == AMDGPU::EXEC)
- continue;
- for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
-
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
-
- markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
- Worklist);
- }
-
- continue;
- }
-
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, StateWQM, Worklist);
- }
+ if (II.Needs == StateWQM)
+ markUsesWQM(MI, Worklist);
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -351,32 +422,140 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
return GlobalFlags;
}
+/// Whether \p MI really requires the exec state computed during analysis.
+///
+/// Scalar instructions must occasionally be marked WQM for correct propagation
+/// (e.g. thread masks leading up to branches), but when it comes to actual
+/// execution, they don't care about EXEC.
+bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
+ if (MI.isTerminator())
+ return true;
+
+ // Skip instructions that are not affected by EXEC
+ if (TII->isScalarUnit(MI))
+ return false;
+
+ // Generic instructions such as COPY will either disappear by register
+ // coalescing or be lowered to SALU or VALU instructions.
+ if (MI.isTransient()) {
+ if (MI.getNumExplicitOperands() >= 1) {
+ const MachineOperand &Op = MI.getOperand(0);
+ if (Op.isReg()) {
+ if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+ // SGPR instructions are not affected by EXEC
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+MachineBasicBlock::iterator
+SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before) {
+ unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr *Save =
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
+ .addReg(AMDGPU::SCC);
+ MachineInstr *Restore =
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(SaveReg);
+
+ LIS->InsertMachineInstrInMaps(*Save);
+ LIS->InsertMachineInstrInMaps(*Restore);
+ LIS->createAndComputeVirtRegInterval(SaveReg);
+
+ return Restore;
+}
+
+// Return an iterator in the (inclusive) range [First, Last] at which
+// instructions can be safely inserted, keeping in mind that some of the
+// instructions we want to add necessarily clobber SCC.
+MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+ MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
+ if (!SaveSCC)
+ return PreferLast ? Last : First;
+
+ LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+ auto MBBE = MBB.end();
+ SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
+ : LIS->getMBBEndIdx(&MBB);
+ SlotIndex LastIdx =
+ Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
+ SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
+ const LiveRange::Segment *S;
+
+ for (;;) {
+ S = LR.getSegmentContaining(Idx);
+ if (!S)
+ break;
+
+ if (PreferLast) {
+ SlotIndex Next = S->start.getBaseIndex();
+ if (Next < FirstIdx)
+ break;
+ Idx = Next;
+ } else {
+ SlotIndex Next = S->end.getNextIndex().getBaseIndex();
+ if (Next > LastIdx)
+ break;
+ Idx = Next;
+ }
+ }
+
+ MachineBasicBlock::iterator MBBI;
+
+ if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
+ MBBI = MI;
+ else {
+ assert(Idx == LIS->getMBBEndIdx(&MBB));
+ MBBI = MBB.end();
+ }
+
+ if (S)
+ MBBI = saveSCC(MBB, MBBI);
+
+ return MBBI;
+}
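prepareInsertion is essentially an interval search: starting from the preferred end of the candidate window, it steps over segments where SCC is live until it either finds an SCC-free point or runs out of window, at which point saveSCC has to bracket the insertion. The sketch below redoes that search over plain integer indices (no SlotIndexes or LiveIntervals), so it only approximates the real walk; findInsertionPoint and Segment are illustrative names.

#include <cstdio>
#include <utility>
#include <vector>

using Segment = std::pair<int, int>; // half-open [start, end) where SCC is live

static const Segment *containing(const std::vector<Segment> &Live, int Idx) {
  for (const Segment &S : Live)
    if (Idx >= S.first && Idx < S.second)
      return &S;
  return nullptr;
}

// Returns the chosen index and whether SCC must be saved/restored around it.
static std::pair<int, bool>
findInsertionPoint(const std::vector<Segment> &Live, int First, int Last,
                   bool PreferLast) {
  int Idx = PreferLast ? Last : First;
  while (const Segment *S = containing(Live, Idx)) {
    int Next = PreferLast ? S->first - 1 : S->second;
    if (Next < First || Next > Last)
      return {Idx, /*NeedsSave=*/true}; // no SCC-free point in the window
    Idx = Next;
  }
  return {Idx, /*NeedsSave=*/false};
}

int main() {
  std::vector<Segment> Live = {{3, 6}}; // SCC live over indices 3..5
  auto R = findInsertionPoint(Live, /*First=*/2, /*Last=*/5, /*PreferLast=*/true);
  std::printf("insert at %d, save SCC: %s\n", R.first, R.second ? "yes" : "no");
  return 0;
}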
+
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg) {
+ MachineInstr *MI;
+
if (SaveWQM) {
- BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
- SaveWQM)
- .addReg(LiveMaskReg);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ SaveWQM)
+ .addReg(LiveMaskReg);
} else {
- BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(LiveMaskReg);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(LiveMaskReg);
}
+
+ LIS->InsertMachineInstrInMaps(*MI);
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SavedWQM) {
+ MachineInstr *MI;
+
if (SavedWQM) {
- BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
- .addReg(SavedWQM);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ .addReg(SavedWQM);
} else {
- BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
}
+
+ LIS->InsertMachineInstrInMaps(*MI);
}
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@@ -395,72 +574,82 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;
+ DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- while (II != IE) {
- MachineInstr &MI = *II;
- ++II;
+ if (isEntry)
+ ++II; // Skip the instruction that saves LiveMask
- // Skip instructions that are not affected by EXEC
- if (TII->isScalarUnit(MI) && !MI.isTerminator())
- continue;
+ MachineBasicBlock::iterator First = IE;
+ for (;;) {
+ MachineBasicBlock::iterator Next = II;
+ char Needs = 0;
+ char OutNeeds = 0;
- // Generic instructions such as COPY will either disappear by register
- // coalescing or be lowered to SALU or VALU instructions.
- if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
- if (MI.getNumExplicitOperands() >= 1) {
- const MachineOperand &Op = MI.getOperand(0);
- if (Op.isReg()) {
- if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- // SGPR instructions are not affected by EXEC
- continue;
- }
+ if (First == IE)
+ First = II;
+
+ if (II != IE) {
+ MachineInstr &MI = *II;
+
+ if (requiresCorrectState(MI)) {
+ auto III = Instructions.find(&MI);
+ if (III != Instructions.end()) {
+ Needs = III->second.Needs;
+ OutNeeds = III->second.OutNeeds;
}
}
- }
- char Needs = 0;
- char OutNeeds = 0;
- auto InstrInfoIt = Instructions.find(&MI);
- if (InstrInfoIt != Instructions.end()) {
- Needs = InstrInfoIt->second.Needs;
- OutNeeds = InstrInfoIt->second.OutNeeds;
-
- // Make sure to switch to Exact mode before the end of the block when
- // Exact and only Exact is needed further downstream.
- if (OutNeeds == StateExact && MI.isTerminator()) {
- assert(Needs == 0);
+ if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+ Needs = StateExact;
+
+ if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+ MI.getOperand(3).setImm(1);
+
+ ++Next;
+ } else {
+ // End of basic block
+ if (BI.OutNeeds & StateWQM)
+ Needs = StateWQM;
+ else if (BI.OutNeeds == StateExact)
Needs = StateExact;
- }
}
- // State switching
- if (Needs && State != Needs) {
- if (Needs == StateExact) {
- assert(!SavedWQMReg);
+ if (Needs) {
+ if (Needs != State) {
+ MachineBasicBlock::iterator Before =
+ prepareInsertion(MBB, First, II, Needs == StateWQM,
+ Needs == StateExact || WQMFromExec);
- if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ if (Needs == StateExact) {
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
- } else {
- assert(WQMFromExec == (SavedWQMReg == 0));
- toWQM(MBB, &MI, SavedWQMReg);
- SavedWQMReg = 0;
+ toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ } else {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+
+ toWQM(MBB, Before, SavedWQMReg);
+
+ if (SavedWQMReg) {
+ LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+ }
+
+ State = Needs;
}
- State = Needs;
+ First = IE;
}
- }
- if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
- assert(WQMFromExec == (SavedWQMReg == 0));
- toWQM(MBB, MBB.end(), SavedWQMReg);
- } else if (BI.OutNeeds == StateExact && State != StateExact) {
- toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ if (II == IE)
+ break;
+ II = Next;
}
}
@@ -468,8 +657,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
unsigned Dest = MI->getOperand(0).getReg();
- BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
- .addReg(LiveMaskReg);
+ MachineInstr *Copy =
+ BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+ .addReg(LiveMaskReg);
+
+ LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
MI->eraseFromParent();
}
}
@@ -480,7 +672,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
- ExecExports.clear();
LiveMaskQueries.clear();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -504,8 +695,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(AMDGPU::EXEC);
+ MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+ TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+ LIS->InsertMachineInstrInMaps(*MI);
}
if (GlobalFlags == StateWQM) {
@@ -520,11 +713,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
+ DEBUG(printInfo());
+
lowerLiveMaskQueries(LiveMaskReg);
// Handle the general case
for (auto BII : Blocks)
processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ // Physical registers like SCC aren't tracked by default anyway, so just
+ // removing the ranges we computed is the simplest option for maintaining
+ // the analysis results.
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+
return true;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
new file mode 100644
index 0000000..0265648
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -0,0 +1,535 @@
+//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
+ NamedMatchClass<"SMRDOffset8">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def smrd_offset_20 : NamedOperandU32<"SMRDOffset20",
+ NamedMatchClass<"SMRDOffset20">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory classes
+//===----------------------------------------------------------------------===//
+
+class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ let LGKM_CNT = 1;
+ let SMRD = 1;
+ let mayStore = 0;
+ let mayLoad = 1;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
+ let SubtargetPredicate = isGCN;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sbase = 1;
+ bits<1> has_sdst = 1;
+ bit has_glc = 0;
+ bits<1> has_offset = 1;
+ bits<1> offset_is_imm = 0;
+}
+
+class SM_Real <SM_Pseudo ps>
+ : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding
+ bits<7> sbase;
+ bits<7> sdst;
+ bits<32> offset;
+ bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+}
+
+class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
+ : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
+ RegisterClass BaseClass;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let has_glc = 1;
+}
+
+class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []>
+ : SM_Pseudo<opName, (outs), ins, asmOps, pattern> {
+ RegisterClass BaseClass;
+ RegisterClass SrcClass;
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_glc = 1;
+ let ScalarStore = 1;
+}
+
+multiclass SM_Pseudo_Loads<string opName,
+ RegisterClass baseClass,
+ RegisterClass dstClass> {
+ def _IMM : SM_Load_Pseudo <opName,
+ (outs dstClass:$sdst),
+ (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+ " $sdst, $sbase, $offset$glc", []> {
+ let offset_is_imm = 1;
+ let BaseClass = baseClass;
+ let PseudoInstr = opName # "_IMM";
+ let has_glc = 1;
+ }
+
+ def _SGPR : SM_Load_Pseudo <opName,
+ (outs dstClass:$sdst),
+ (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+ " $sdst, $sbase, $offset$glc", []> {
+ let BaseClass = baseClass;
+ let PseudoInstr = opName # "_SGPR";
+ let has_glc = 1;
+ }
+}
+
+multiclass SM_Pseudo_Stores<string opName,
+ RegisterClass baseClass,
+ RegisterClass srcClass> {
+ def _IMM : SM_Store_Pseudo <opName,
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+ " $sdata, $sbase, $offset$glc", []> {
+ let offset_is_imm = 1;
+ let BaseClass = baseClass;
+ let SrcClass = srcClass;
+ let PseudoInstr = opName # "_IMM";
+ }
+
+ def _SGPR : SM_Store_Pseudo <opName,
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+ " $sdata, $sbase, $offset$glc", []> {
+ let BaseClass = baseClass;
+ let SrcClass = srcClass;
+ let PseudoInstr = opName # "_SGPR";
+ }
+}
+
+class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+ opName, (outs SReg_64_XEXEC:$sdst), (ins),
+ " $sdst", [(set i64:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ // FIXME: mayStore = ? is a workaround for a tablegen bug: it infers
+ // different mayStore flags for the instruction pattern vs. a standalone
+ // Pat, and each considers the other contradictory.
+ let mayStore = ?;
+ let mayLoad = ?;
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
+class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
+ opName, (outs), (ins), "", [(node)]> {
+ let hasSideEffects = 1;
+ let mayStore = 1;
+ let has_sdst = 0;
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Instructions
+//===----------------------------------------------------------------------===//
+
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+
+// XXX - SMEM instructions do not allow exec for the data operand, but
+// is it allowed for sdst of SMRD on SI/CI?
+defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
+ "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
+// SI/CI, but disallowed for SMEM on VI.
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
+ "s_buffer_load_dwordx16", SReg_128, SReg_512
+>;
+
+defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
+ "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
+ "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
+ "s_buffer_store_dwordx4", SReg_128, SReg_128
+>;
+
+
+def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
+def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
+
+let SubtargetPredicate = isCIVI in {
+def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
+} // let SubtargetPredicate = isCIVI
+
+let SubtargetPredicate = isVI in {
+def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
+def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
+} // SubtargetPredicate = isVI
+
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Patterns
+//===----------------------------------------------------------------------===//
+
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ auto Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= 4 &&
+ ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
+}]>;
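The smrd_load PatFrag above is the gatekeeper for scalar-memory selection, and its condition reads more easily as a plain predicate. The function below is only a readability aid: the address-space enum and the boolean parameters stand in for the real AMDGPUAS values and the SITargetLowering/subtarget queries, and isScalarLoadCandidate is an invented name.

#include <cstdio>

// Illustrative stand-ins for the real address-space constants.
enum AddrSpace { GLOBAL_ADDRESS, CONSTANT_ADDRESS, OTHER };

// A load may go to a scalar (SMRD/SMEM) instruction when it is at least
// 4-byte aligned and either (a) reads the constant address space with a
// uniform address, or (b) the subtarget scalarizes global loads, the address
// is uniform, and no aliasing store can clobber the value.
static bool isScalarLoadCandidate(AddrSpace AS, unsigned Align, bool Uniform,
                                  bool NoClobberedMemOperand,
                                  bool ScalarizeGlobal) {
  if (Align < 4)
    return false;
  if (AS == CONSTANT_ADDRESS && Uniform)
    return true;
  return ScalarizeGlobal && AS == GLOBAL_ADDRESS && Uniform &&
         NoClobberedMemOperand;
}

int main() {
  std::printf("%d\n", isScalarLoadCandidate(CONSTANT_ADDRESS, 4, true, false,
                                            false)); // 1
  std::printf("%d\n", isScalarLoadCandidate(GLOBAL_ADDRESS, 4, true, true,
                                            false)); // 0: needs scalarization
  return 0;
}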
+
+def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+
+let Predicates = [isGCN] in {
+
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset
+ def : Pat <
+ (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+ >;
+
+ // 2. SGPR offset
+ def : Pat <
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ >;
+}
+
+let Predicates = [isSICI] in {
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMTIME)
+>;
+}
+
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
+
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+
+// 1. Offset as an immediate
+def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+>;
+
+// 2. Offset loaded in a 32-bit SGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+>;
+
+} // End let AddedComplexity = 100
+
+} // let Predicates = [isGCN]
+
+let Predicates = [isVI] in {
+
+// 1. Offset as a 20-bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
+>;
+
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMREALTIME)
+>;
+
+} // let Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Targets
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
+ , Enc32 {
+
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
+ let Inst{8} = imm;
+ let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+}
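The Enc32 assignments above fully determine the 32-bit SI SMRD word, so it can be packed by hand. The helper below mirrors those exact bit ranges; it takes already-encoded field values and illustrates the layout only, it is not an assembler (encodeSMRD_si is an invented name).

#include <cstdint>
#include <cstdio>

// Pack an SI SMRD word: offset in [7:0], imm flag at bit 8, sbase{6-1} in
// [14:9] (the low bit of the 7-bit sbase encoding is dropped, since the base
// is a 64-bit aligned SGPR pair), sdst in [21:15], the 5-bit opcode in
// [26:22], and the fixed 0x18 major opcode in [31:27].
static uint32_t encodeSMRD_si(uint8_t op5, uint8_t sdst7, uint8_t sbase7,
                              bool imm, uint8_t offset8) {
  uint32_t Inst = 0;
  Inst |= static_cast<uint32_t>(offset8);                     // Inst{7-0}
  Inst |= static_cast<uint32_t>(imm) << 8;                    // Inst{8}
  Inst |= (static_cast<uint32_t>(sbase7 >> 1) & 0x3f) << 9;   // Inst{14-9}
  Inst |= (static_cast<uint32_t>(sdst7) & 0x7f) << 15;        // Inst{21-15}
  Inst |= (static_cast<uint32_t>(op5) & 0x1f) << 22;          // Inst{26-22}
  Inst |= 0x18u << 27;                                        // Inst{31-27}
  return Inst;
}

int main() {
  // e.g. op=0x00 with zeroed raw sdst/sbase encodings and imm offset 4.
  std::printf("0x%08x\n", encodeSMRD_si(0x00, 0x00, 0x00, true, 4));
  return 0;
}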
+
+// FIXME: Assembler should reject trying to use glc on SMRD
+// instructions on SI.
+multiclass SM_Real_Loads_si<bits<5> op, string ps,
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+
+ def _IMM_si : SMRD_Real_si <op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc);
+ }
+
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _SGPR_si : SMRD_Real_si <op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ }
+
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+def S_MEMTIME_si : SMRD_Real_si <0x1e, S_MEMTIME>;
+def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
+ , Enc64 {
+ bit glc;
+
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+
+ let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{17} = imm;
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x30; //encoding
+ let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?);
+}
+
+multiclass SM_Real_Loads_vi<bits<8> op, string ps,
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+ def _IMM_vi : SMEM_Real_vi <op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ }
+ def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ }
+}
+
+class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
+ // encoding
+ bits<7> sdata;
+
+ let sdst = ?;
+ let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
+}
+
+multiclass SM_Real_Stores_vi<bits<8> op, string ps,
+ SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
+ SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ }
+
+ def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ }
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">;
+defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">;
+
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">;
+
+// These instructions use the same encoding
+def S_DCACHE_INV_vi : SMEM_Real_vi <0x20, S_DCACHE_INV>;
+def S_DCACHE_WB_vi : SMEM_Real_vi <0x21, S_DCACHE_WB>;
+def S_DCACHE_INV_VOL_vi : SMEM_Real_vi <0x22, S_DCACHE_INV_VOL>;
+def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
+def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
+def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset",
+ NamedMatchClass<"SMRDLiteralOffset">> {
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
+ SM_Real<ps>,
+ Enc64 {
+
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
+
+ let LGKM_CNT = ps.LGKM_CNT;
+ let SMRD = ps.SMRD;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let hasSideEffects = ps.hasSideEffects;
+ let SchedRW = ps.SchedRW;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+
+ let Inst{7-0} = 0xff;
+ let Inst{8} = 0;
+ let Inst{14-9} = sbase{6-1};
+ let Inst{21-15} = sdst{6-0};
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+ let Inst{63-32} = offset{31-0};
+}
+
+def S_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x00, S_LOAD_DWORD_IMM>;
+def S_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x01, S_LOAD_DWORDX2_IMM>;
+def S_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x02, S_LOAD_DWORDX4_IMM>;
+def S_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x03, S_LOAD_DWORDX8_IMM>;
+def S_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x04, S_LOAD_DWORDX16_IMM>;
+def S_BUFFER_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x08, S_BUFFER_LOAD_DWORD_IMM>;
+def S_BUFFER_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x09, S_BUFFER_LOAD_DWORDX2_IMM>;
+def S_BUFFER_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x0a, S_BUFFER_LOAD_DWORDX4_IMM>;
+def S_BUFFER_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x0b, S_BUFFER_LOAD_DWORDX8_IMM>;
+def S_BUFFER_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x0c, S_BUFFER_LOAD_DWORDX16_IMM>;
+
+class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
+ : SM_Real<ps>
+ , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
+ , Enc32 {
+
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+
+ let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
+ let Inst{8} = imm;
+ let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+}
+
+def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
+
+let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+
+class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat <
+ (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+ let Predicates = [isCIOnly];
+}
+
+def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX2", v2i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
+
+def : Pat <
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+ let Predicates = [isCI]; // should this be isCIOnly?
+}
+
+} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
new file mode 100644
index 0000000..73cd577
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -0,0 +1,1232 @@
+//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def GPRIdxModeMatchClass : AsmOperandClass {
+ let Name = "GPRIdxMode";
+ let PredicateMethod = "isGPRIdxMode";
+ let RenderMethod = "addImmOperands";
+}
+
+def GPRIdxMode : Operand<i32> {
+ let PrintMethod = "printVGPRIndexMode";
+ let ParserMatchClass = GPRIdxModeMatchClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP1_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOP1 = 1;
+ let SchedRW = [WriteSALU];
+ let Size = 4;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_src0 = 1;
+ bits<1> has_sdst = 1;
+}
+
+class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []>,
+ Enc32 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+ let Size = 4;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding
+ bits<7> sdst;
+ bits<8> src0;
+
+ let Inst{7-0} = !if(ps.has_src0, src0, ?);
+ let Inst{15-8} = op;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+ let Inst{31-23} = 0x17d; //encoding;
+}
+
+class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 32-bit input, no output.
+class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+ opName, (outs), (ins SSrc_b32:$src0),
+ "$src0", pattern> {
+ let has_sdst = 0;
+}
+
+class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 64-bit input, 32-bit output.
+class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// 32-bit input, 64-bit output.
+class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0),
+ "$sdst, $src0", pattern
+>;
+
+// no input, 64-bit output.
+class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins), "$sdst", pattern> {
+ let has_src0 = 0;
+}
+
+// 64-bit input, no output
+class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+ let has_sdst = 0;
+}
+
+
+let isMoveImm = 1 in {
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def S_MOV_B32 : SOP1_32 <"s_mov_b32">;
+ def S_MOV_B64 : SOP1_64 <"s_mov_b64">;
+ } // End isReMaterializable = 1, isAsCheapAsAMove = 1
+
+ let Uses = [SCC] in {
+ def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">;
+ def S_CMOV_B64 : SOP1_64 <"s_cmov_b64">;
+ } // End Uses = [SCC]
+} // End isMoveImm = 1
+
+let Defs = [SCC] in {
+ def S_NOT_B32 : SOP1_32 <"s_not_b32",
+ [(set i32:$sdst, (not i32:$src0))]
+ >;
+
+ def S_NOT_B64 : SOP1_64 <"s_not_b64",
+ [(set i64:$sdst, (not i64:$src0))]
+ >;
+ def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+} // End Defs = [SCC]
+
+
+def S_BREV_B32 : SOP1_32 <"s_brev_b32",
+ [(set i32:$sdst, (bitreverse i32:$src0))]
+>;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+
+let Defs = [SCC] in {
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
+ [(set i32:$sdst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+} // End Defs = [SCC]
+
+def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
+def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
+def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
+ [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
+>;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+
+def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
+ [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
+>;
+
+def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">;
+def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
+ [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
+>;
+def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
+def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
+ [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
+>;
+def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
+ [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
+>;
+
+def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
+def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
+def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
+def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
+
+let isTerminator = 1, isBarrier = 1,
+ isBranch = 1, isIndirectBranch = 1 in {
+def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+}
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
+def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
+
+def S_AND_SAVEEXEC_B64 : SOP1_64 <"s_and_saveexec_b64">;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <"s_or_saveexec_b64">;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <"s_xor_saveexec_b64">;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <"s_andn2_saveexec_b64">;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <"s_orn2_saveexec_b64">;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <"s_nand_saveexec_b64">;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <"s_nor_saveexec_b64">;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">;
+def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
+
+let Uses = [M0] in {
+def S_MOVRELS_B32 : SOP1_32 <"s_movrels_b32">;
+def S_MOVRELS_B64 : SOP1_64 <"s_movrels_b64">;
+def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
+def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
+} // End Uses = [M0]
+
+def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">;
+def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
+let Defs = [SCC] in {
+def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+} // End Defs = [SCC]
+def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
+ let Uses = [M0];
+ let Defs = [M0];
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP2_Pseudo<string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI<outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOP2 = 1;
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 1;
+
+ // Pseudo instructions have no encodings, but adding this field here allows
+ // us to do:
+ // let sdst = xxx in {
+ // for multiclasses that include both real and pseudo instructions.
+ // field bits<7> sdst = 0;
+ // let Size = 4; // Do we need size here?
+}
+
+class SOP2_Real<bits<7> op, SOP2_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []>,
+ Enc32 {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+
+ // encoding
+ bits<7> sdst;
+ bits<8> src0;
+ bits<8> src1;
+
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+ let Inst{29-23} = op;
+ let Inst{31-30} = 0x2; // encoding
+}
+
+
+class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <"s_add_u32">;
+def S_ADD_I32 : SOP2_32 <"s_add_i32",
+ [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
+def S_SUB_I32 : SOP2_32 <"s_sub_i32",
+ [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
+ [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
+ [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+} // End Uses = [SCC]
+
+
+let isCommutable = 1 in {
+def S_MIN_I32 : SOP2_32 <"s_min_i32",
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <"s_min_u32",
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <"s_max_i32",
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <"s_max_u32",
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+>;
+} // End isCommutable = 1
+} // End Defs = [SCC]
+
+
+let Uses = [SCC] in {
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+ def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
+} // End Uses = [SCC]
+
+let Defs = [SCC] in {
+let isCommutable = 1 in {
+def S_AND_B32 : SOP2_32 <"s_and_b32",
+ [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <"s_and_b64",
+ [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <"s_or_b32",
+ [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <"s_or_b64",
+ [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <"s_xor_b32",
+ [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <"s_xor_b64",
+ [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
+def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
+def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
+def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
+def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">;
+def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">;
+} // End Defs = [SCC]
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+let Defs = [SCC] in {
+def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
+ [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
+ [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
+ [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
+ [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
+ [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
+ [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
+>;
+} // End Defs = [SCC]
+
+def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
+ [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+def S_MUL_I32 : SOP2_32 <"s_mul_i32",
+ [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+ let isCommutable = 1;
+}
+
+} // End AddedComplexity = 1
+
+let Defs = [SCC] in {
+def S_BFE_U32 : SOP2_32 <"s_bfe_u32">;
+def S_BFE_I32 : SOP2_32 <"s_bfe_i32">;
+def S_BFE_U64 : SOP2_64_32 <"s_bfe_u64">;
+def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">;
+} // End Defs = [SCC]
+
+def S_CBRANCH_G_FORK : SOP2_Pseudo <
+ "s_cbranch_g_fork", (outs),
+ (ins SReg_64:$src0, SReg_64:$src1),
+ "$src0, $src1"
+> {
+ let has_sdst = 0;
+}
+
+let Defs = [SCC] in {
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
+} // End Defs = [SCC]
+
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPK_Pseudo <string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ InstSI <outs, ins, "", pattern>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let SubtargetPredicate = isGCN;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPK = 1;
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+ string Mnemonic = opName;
+ string AsmOperands = asmOps;
+
+ bits<1> has_sdst = 1;
+}
+
+class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ ps.Mnemonic # " " # ps.AsmOperands, []> {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let DisableEncoding = ps.DisableEncoding;
+ let Constraints = ps.Constraints;
+
+ // encoding
+ bits<7> sdst;
+ bits<16> simm16;
+ bits<32> imm;
+}
+
+class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real <op, ps>,
+ Enc32 {
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+ let Inst{27-23} = op;
+ let Inst{31-28} = 0xb; //encoding
+}
+
+class SOPK_Real64<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real<op, ps>,
+ Enc64 {
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+ let Inst{27-23} = op;
+ let Inst{31-28} = 0xb; //encoding
+ let Inst{63-32} = imm;
+}
+
+class SOPKInstTable <bit is_sopk, string cmpOp = ""> {
+ bit IsSOPK = is_sopk;
+ string BaseCmpOp = cmpOp;
+}
+
+class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+ opName,
+ (outs SReg_32:$sdst),
+ (ins u16imm:$simm16),
+ "$sdst, $simm16",
+ pattern>;
+
+class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
+ opName,
+ (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16),
+ "$sdst, $simm16", []>,
+ SOPKInstTable<1, base_op>{
+ let Defs = [SCC];
+}
+
+class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+ opName,
+ (outs SReg_32:$sdst),
+ (ins SReg_32:$src0, u16imm:$simm16),
+ "$sdst, $simm16",
+ pattern
+>;
+
+let isReMaterializable = 1, isMoveImm = 1 in {
+def S_MOVK_I32 : SOPK_32 <"s_movk_i32">;
+} // End isReMaterializable = 1
+let Uses = [SCC] in {
+def S_CMOVK_I32 : SOPK_32 <"s_cmovk_i32">;
+}
+
+let isCompare = 1 in {
+
+// This instruction is disabled for now until we can figure out how to teach
+// the instruction selector to correctly use the S_CMP* vs V_CMP*
+// instructions.
+//
+// When this instruction is enabled the code generator sometimes produces this
+// invalid sequence:
+//
+// SCC = S_CMPK_EQ_I32 SGPR0, imm
+// VCC = COPY SCC
+// VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
+//
+// def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32",
+// [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
+// >;
+
+def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">;
+def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">;
+def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">;
+def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">;
+def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">;
+def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">;
+
+let SOPKZext = 1 in {
+def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">;
+def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">;
+def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">;
+def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">;
+def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">;
+def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">;
+} // End SOPKZext = 1
+} // End isCompare = 1
+
+let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
+ Constraints = "$sdst = $src0" in {
+ def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">;
+ def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
+}
+
+def S_CBRANCH_I_FORK : SOPK_Pseudo <
+ "s_cbranch_i_fork",
+ (outs), (ins SReg_64:$sdst, u16imm:$simm16),
+ "$sdst, $simm16"
+>;
+
+let mayLoad = 1 in {
+def S_GETREG_B32 : SOPK_Pseudo <
+ "s_getreg_b32",
+ (outs SReg_32:$sdst), (ins hwreg:$simm16),
+ "$sdst, $simm16"
+>;
+}
+
+let hasSideEffects = 1 in {
+
+def S_SETREG_B32 : SOPK_Pseudo <
+ "s_setreg_b32",
+ (outs), (ins SReg_32:$sdst, hwreg:$simm16),
+ "$simm16, $sdst",
+ [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
+>;
+
+// FIXME: Not on SI?
+//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
+
+def S_SETREG_IMM32_B32 : SOPK_Pseudo <
+ "s_setreg_imm32_b32",
+ (outs), (ins i32imm:$imm, hwreg:$simm16),
+ "$simm16, $imm"> {
+ let Size = 8; // Unlike every other SOPK instruction.
+ let has_sdst = 0;
+}
+
+} // End hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPCe <bits<7> op> : Enc32 {
+ bits<8> src0;
+ bits<8> src1;
+
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17e;
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm,
+ list<dag> pattern = []> :
+ InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPC = 1;
+ let isCodeGenOnly = 0;
+ let Defs = [SCC];
+ let SchedRW = [WriteSALU];
+ let UseNamedOperandTable = 1;
+ let SubtargetPredicate = isGCN;
+}
+
+class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
+ string opName, list<dag> pattern = []> : SOPC <
+ op, (outs), (ins rc0:$src0, rc1:$src1),
+ opName#" $src0, $src1", pattern > {
+ let Defs = [SCC];
+}
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
+ string opName, PatLeaf cond> : SOPC_Base <
+ op, rc, rc, opName,
+ [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
+}
+
+class SOPC_CMP_32<bits<7> op, string opName,
+ PatLeaf cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)>,
+ SOPKInstTable<0, opName> {
+ let isCompare = 1;
+ let isCommutable = 1;
+}
+
+class SOPC_CMP_64<bits<7> op, string opName,
+ PatLeaf cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)> {
+ let isCompare = 1;
+ let isCommutable = 1;
+}
+
+class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_b32, SSrc_b32, opName, pattern>;
+
+class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_b64, SSrc_b32, opName, pattern>;
+
+def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00, "s_cmp_eq_i32">;
+def S_CMP_LG_I32 : SOPC_CMP_32 <0x01, "s_cmp_lg_i32">;
+def S_CMP_GT_I32 : SOPC_CMP_32 <0x02, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <0x03, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <0x04, "s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">;
+def S_CMP_LE_I32 : SOPC_CMP_32 <0x05, "s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <0x06, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <0x07, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <0x08, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <0x09, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0a, "s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0b, "s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">;
+
+def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
+
+let SubtargetPredicate = isVI in {
+def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
+def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_ON : SOPC <0x11,
+ (outs),
+ (ins SSrc_b32:$src0, GPRIdxMode:$src1),
+ "s_set_gpr_idx_on $src0,$src1"> {
+ let Defs = [M0]; // No scc def
+ let Uses = [M0]; // Other bits of m0 unmodified.
+ let hasSideEffects = 1; // Sets mode.gpr_idx_en
+ let FixedSize = 1;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPPe <bits<7> op> : Enc32 {
+ bits <16> simm16;
+
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17f; // encoding
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
+ InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPP = 1;
+ let Size = 4;
+ let SchedRW = [WriteSALU];
+
+ let UseNamedOperandTable = 1;
+ let SubtargetPredicate = isGCN;
+}
+
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
+ [(AMDGPUendpgm)]> {
+ let simm16 = 0;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
+let isBranch = 1, SchedRW = [WriteBranch] in {
+def S_BRANCH : SOPP <
+ 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
+ [(br bb:$simm16)]> {
+ let isBarrier = 1;
+}
+
+let Uses = [SCC] in {
+def S_CBRANCH_SCC0 : SOPP <
+ 0x00000004, (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc0 $simm16"
+>;
+def S_CBRANCH_SCC1 : SOPP <
+ 0x00000005, (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc1 $simm16",
+ [(si_uniform_br_scc SCC, bb:$simm16)]
+>;
+} // End Uses = [SCC]
+
+let Uses = [VCC] in {
+def S_CBRANCH_VCCZ : SOPP <
+ 0x00000006, (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccz $simm16"
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+ 0x00000007, (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccnz $simm16"
+>;
+} // End Uses = [VCC]
+
+let Uses = [EXEC] in {
+def S_CBRANCH_EXECZ : SOPP <
+ 0x00000008, (ins sopp_brtarget:$simm16),
+ "s_cbranch_execz $simm16"
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+ 0x00000009, (ins sopp_brtarget:$simm16),
+ "s_cbranch_execnz $simm16"
+>;
+} // End Uses = [EXEC]
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in {
+def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
+ [(int_amdgcn_s_barrier)]> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let isConvergent = 1;
+}
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+
+// On SI the documentation says sleep for approximately 64 * low 3
+// bits, consistent with the reported maximum of 448 (64 * 7). On VI the
+// maximum reported is 960 cycles, so 960 / 64 = 15; is the
+// maximum really 15 on VI?
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+ "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
+
+let Uses = [EXEC, M0] in {
+// FIXME: Should this be mayLoad+mayStore?
+def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
+ [(AMDGPUsendmsg (i32 imm:$simm16))]
+>;
+
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
+ [(AMDGPUsendmsghalt (i32 imm:$simm16))]
+>;
+} // End Uses = [EXEC, M0]
+
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+ let simm16 = 0;
+}
+def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
+ [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
+ [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+ let simm16 = 0;
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
+ let simm16 = 0;
+}
+}
+} // End hasSideEffects
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
+ "s_set_gpr_idx_mode$simm16"> {
+ let Defs = [M0];
+}
+}
+
+let Predicates = [isGCN] in {
+
+//===----------------------------------------------------------------------===//
+// S_GETREG_B32 Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_s_getreg imm:$simm16),
+ (S_GETREG_B32 (as_i16imm $simm16))
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i64 (ctpop i64:$src)),
+ (i64 (REG_SEQUENCE SReg_64,
+ (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
+ (S_MOV_B32 (i32 0)), sub1))
+>;
+
+def : Pat <
+ (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (S_ABS_I32 $x)
+>;
+
+def : Pat <
+ (i16 imm:$imm),
+ (S_MOV_B32 imm:$imm)
+>;
+
+// Same as a 32-bit inreg
+def : Pat<
+ (i32 (sext i16:$src)),
+ (S_SEXT_I32_I16 $src)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+ (i32 (addc i32:$src0, i32:$src1)),
+ (S_ADD_U32 $src0, $src1)
+>;
+
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple
+// outputs.
+def : Pat<
+ (i64 (zext i16:$src)),
+ (REG_SEQUENCE SReg_64,
+ (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+ (i64 (sext i16:$src)),
+ (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
+>;
+
+def : Pat<
+ (i32 (zext i16:$src)),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+
+
+//===----------------------------------------------------------------------===//
+// SOPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (int_amdgcn_s_waitcnt i32:$simm16),
+ (S_WAITCNT (as_i16imm $simm16))
+>;
+
+} // End isGCN predicate
+
+
+//===----------------------------------------------------------------------===//
+// Real target instructions, move this to the appropriate subtarget TD file
+//===----------------------------------------------------------------------===//
+
+class Select_si<string opName> :
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ list<Predicate> AssemblerPredicates = [isSICI];
+ string DecoderNamespace = "SICI";
+}
+
+class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> :
+ SOP1_Real<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> :
+ SOP2_Real<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real32<op, ps>,
+ Select_si<ps.Mnemonic>;
+
+def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>;
+def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>;
+def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>;
+def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>;
+def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>;
+def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>;
+def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>;
+def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>;
+def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>;
+def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>;
+def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>;
+def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>;
+def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>;
+def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>;
+def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>;
+def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>;
+def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>;
+def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>;
+def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>;
+def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>;
+def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>;
+def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>;
+def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>;
+def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>;
+def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>;
+def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>;
+def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>;
+def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>;
+def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>;
+
+def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>;
+def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>;
+def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>;
+def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>;
+def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>;
+def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>;
+def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>;
+def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>;
+def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>;
+def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>;
+def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>;
+def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>;
+def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>;
+def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>;
+def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>;
+def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>;
+def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>;
+def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>;
+def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>;
+def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>;
+def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>;
+def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>;
+def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>;
+def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>;
+def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>;
+def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>;
+def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>;
+def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>;
+def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>;
+def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>;
+def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>;
+def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>;
+def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>;
+def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>;
+def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>;
+def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>;
+def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>;
+def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>;
+def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>;
+def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>;
+def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
+def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>;
+def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>;
+def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>;
+def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
+ Select_si<S_SETREG_IMM32_B32.Mnemonic>;
+
+
+class Select_vi<string opName> :
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ list<Predicate> AssemblerPredicates = [isVI];
+ string DecoderNamespace = "VI";
+}
+
+class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
+ SOP1_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+
+class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
+ SOP2_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
+ SOPK_Real32<op, ps>,
+ Select_vi<ps.Mnemonic>;
+
+def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>;
+def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>;
+def S_CMOV_B32_vi : SOP1_Real_vi <0x02, S_CMOV_B32>;
+def S_CMOV_B64_vi : SOP1_Real_vi <0x03, S_CMOV_B64>;
+def S_NOT_B32_vi : SOP1_Real_vi <0x04, S_NOT_B32>;
+def S_NOT_B64_vi : SOP1_Real_vi <0x05, S_NOT_B64>;
+def S_WQM_B32_vi : SOP1_Real_vi <0x06, S_WQM_B32>;
+def S_WQM_B64_vi : SOP1_Real_vi <0x07, S_WQM_B64>;
+def S_BREV_B32_vi : SOP1_Real_vi <0x08, S_BREV_B32>;
+def S_BREV_B64_vi : SOP1_Real_vi <0x09, S_BREV_B64>;
+def S_BCNT0_I32_B32_vi : SOP1_Real_vi <0x0a, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_vi : SOP1_Real_vi <0x0b, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_vi : SOP1_Real_vi <0x0c, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_vi : SOP1_Real_vi <0x0d, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_vi : SOP1_Real_vi <0x0e, S_FF0_I32_B32>;
+def S_FF0_I32_B64_vi : SOP1_Real_vi <0x0f, S_FF0_I32_B64>;
+def S_FF1_I32_B32_vi : SOP1_Real_vi <0x10, S_FF1_I32_B32>;
+def S_FF1_I32_B64_vi : SOP1_Real_vi <0x11, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_vi : SOP1_Real_vi <0x12, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_vi : SOP1_Real_vi <0x13, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_vi : SOP1_Real_vi <0x14, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_vi : SOP1_Real_vi <0x15, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_vi : SOP1_Real_vi <0x16, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_vi : SOP1_Real_vi <0x17, S_SEXT_I32_I16>;
+def S_BITSET0_B32_vi : SOP1_Real_vi <0x18, S_BITSET0_B32>;
+def S_BITSET0_B64_vi : SOP1_Real_vi <0x19, S_BITSET0_B64>;
+def S_BITSET1_B32_vi : SOP1_Real_vi <0x1a, S_BITSET1_B32>;
+def S_BITSET1_B64_vi : SOP1_Real_vi <0x1b, S_BITSET1_B64>;
+def S_GETPC_B64_vi : SOP1_Real_vi <0x1c, S_GETPC_B64>;
+def S_SETPC_B64_vi : SOP1_Real_vi <0x1d, S_SETPC_B64>;
+def S_SWAPPC_B64_vi : SOP1_Real_vi <0x1e, S_SWAPPC_B64>;
+def S_RFE_B64_vi : SOP1_Real_vi <0x1f, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x20, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x21, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x22, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_vi: SOP1_Real_vi <0x23, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_vi : SOP1_Real_vi <0x24, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x25, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x26, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x27, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_vi : SOP1_Real_vi <0x28, S_QUADMASK_B32>;
+def S_QUADMASK_B64_vi : SOP1_Real_vi <0x29, S_QUADMASK_B64>;
+def S_MOVRELS_B32_vi : SOP1_Real_vi <0x2a, S_MOVRELS_B32>;
+def S_MOVRELS_B64_vi : SOP1_Real_vi <0x2b, S_MOVRELS_B64>;
+def S_MOVRELD_B32_vi : SOP1_Real_vi <0x2c, S_MOVRELD_B32>;
+def S_MOVRELD_B64_vi : SOP1_Real_vi <0x2d, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_vi : SOP1_Real_vi <0x2e, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_vi : SOP1_Real_vi <0x2f, S_MOV_REGRD_B32>;
+def S_ABS_I32_vi : SOP1_Real_vi <0x30, S_ABS_I32>;
+def S_MOV_FED_B32_vi : SOP1_Real_vi <0x31, S_MOV_FED_B32>;
+def S_SET_GPR_IDX_IDX_vi : SOP1_Real_vi <0x32, S_SET_GPR_IDX_IDX>;
+
+def S_ADD_U32_vi : SOP2_Real_vi <0x00, S_ADD_U32>;
+def S_ADD_I32_vi : SOP2_Real_vi <0x02, S_ADD_I32>;
+def S_SUB_U32_vi : SOP2_Real_vi <0x01, S_SUB_U32>;
+def S_SUB_I32_vi : SOP2_Real_vi <0x03, S_SUB_I32>;
+def S_ADDC_U32_vi : SOP2_Real_vi <0x04, S_ADDC_U32>;
+def S_SUBB_U32_vi : SOP2_Real_vi <0x05, S_SUBB_U32>;
+def S_MIN_I32_vi : SOP2_Real_vi <0x06, S_MIN_I32>;
+def S_MIN_U32_vi : SOP2_Real_vi <0x07, S_MIN_U32>;
+def S_MAX_I32_vi : SOP2_Real_vi <0x08, S_MAX_I32>;
+def S_MAX_U32_vi : SOP2_Real_vi <0x09, S_MAX_U32>;
+def S_CSELECT_B32_vi : SOP2_Real_vi <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_vi : SOP2_Real_vi <0x0b, S_CSELECT_B64>;
+def S_AND_B32_vi : SOP2_Real_vi <0x0c, S_AND_B32>;
+def S_AND_B64_vi : SOP2_Real_vi <0x0d, S_AND_B64>;
+def S_OR_B32_vi : SOP2_Real_vi <0x0e, S_OR_B32>;
+def S_OR_B64_vi : SOP2_Real_vi <0x0f, S_OR_B64>;
+def S_XOR_B32_vi : SOP2_Real_vi <0x10, S_XOR_B32>;
+def S_XOR_B64_vi : SOP2_Real_vi <0x11, S_XOR_B64>;
+def S_ANDN2_B32_vi : SOP2_Real_vi <0x12, S_ANDN2_B32>;
+def S_ANDN2_B64_vi : SOP2_Real_vi <0x13, S_ANDN2_B64>;
+def S_ORN2_B32_vi : SOP2_Real_vi <0x14, S_ORN2_B32>;
+def S_ORN2_B64_vi : SOP2_Real_vi <0x15, S_ORN2_B64>;
+def S_NAND_B32_vi : SOP2_Real_vi <0x16, S_NAND_B32>;
+def S_NAND_B64_vi : SOP2_Real_vi <0x17, S_NAND_B64>;
+def S_NOR_B32_vi : SOP2_Real_vi <0x18, S_NOR_B32>;
+def S_NOR_B64_vi : SOP2_Real_vi <0x19, S_NOR_B64>;
+def S_XNOR_B32_vi : SOP2_Real_vi <0x1a, S_XNOR_B32>;
+def S_XNOR_B64_vi : SOP2_Real_vi <0x1b, S_XNOR_B64>;
+def S_LSHL_B32_vi : SOP2_Real_vi <0x1c, S_LSHL_B32>;
+def S_LSHL_B64_vi : SOP2_Real_vi <0x1d, S_LSHL_B64>;
+def S_LSHR_B32_vi : SOP2_Real_vi <0x1e, S_LSHR_B32>;
+def S_LSHR_B64_vi : SOP2_Real_vi <0x1f, S_LSHR_B64>;
+def S_ASHR_I32_vi : SOP2_Real_vi <0x20, S_ASHR_I32>;
+def S_ASHR_I64_vi : SOP2_Real_vi <0x21, S_ASHR_I64>;
+def S_BFM_B32_vi : SOP2_Real_vi <0x22, S_BFM_B32>;
+def S_BFM_B64_vi : SOP2_Real_vi <0x23, S_BFM_B64>;
+def S_MUL_I32_vi : SOP2_Real_vi <0x24, S_MUL_I32>;
+def S_BFE_U32_vi : SOP2_Real_vi <0x25, S_BFE_U32>;
+def S_BFE_I32_vi : SOP2_Real_vi <0x26, S_BFE_I32>;
+def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>;
+def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>;
+def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_vi : SOPK_Real_vi <0x02, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_vi : SOPK_Real_vi <0x03, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_vi : SOPK_Real_vi <0x04, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_vi : SOPK_Real_vi <0x05, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_vi : SOPK_Real_vi <0x06, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_vi : SOPK_Real_vi <0x07, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_vi : SOPK_Real_vi <0x08, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_vi : SOPK_Real_vi <0x09, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_vi : SOPK_Real_vi <0x0A, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_vi : SOPK_Real_vi <0x0B, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_vi : SOPK_Real_vi <0x0C, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_vi : SOPK_Real_vi <0x0D, S_CMPK_LE_U32>;
+def S_ADDK_I32_vi : SOPK_Real_vi <0x0E, S_ADDK_I32>;
+def S_MULK_I32_vi : SOPK_Real_vi <0x0F, S_MULK_I32>;
+def S_CBRANCH_I_FORK_vi : SOPK_Real_vi <0x10, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>;
+def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
+ Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 2112135..9908fc0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -18,13 +18,20 @@ using namespace llvm;
/// \brief The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be an R600 target and a GCN target.
-Target llvm::TheAMDGPUTarget;
+Target &llvm::getTheAMDGPUTarget() {
+ static Target TheAMDGPUTarget;
+ return TheAMDGPUTarget;
+}
/// \brief The target for GCN GPUs
-Target llvm::TheGCNTarget;
+Target &llvm::getTheGCNTarget() {
+ static Target TheGCNTarget;
+ return TheGCNTarget;
+}
/// \brief Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeAMDGPUTargetInfo() {
- RegisterTarget<Triple::r600, false>
- R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
- RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs");
+ RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
+ "AMD GPUs HD2XXX-HD6XXX");
+ RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
+ "AMD GCN GPUs");
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c6f9142..5f651d4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -8,10 +8,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
#include "AMDGPU.h"
+#include "SIDefines.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
@@ -24,6 +27,55 @@
#include "AMDGPUGenRegisterInfo.inc"
#undef GET_REGINFO_ENUM
+#define GET_INSTRINFO_NAMED_OPS
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_NAMED_OPS
+#undef GET_INSTRINFO_ENUM
+
+namespace {
+
+/// \returns Bit mask for given bit \p Shift and bit \p Width.
+unsigned getBitMask(unsigned Shift, unsigned Width) {
+ return ((1 << Width) - 1) << Shift;
+}
+
+/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
+///
+/// \returns Packed \p Dst.
+unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
+ Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width);
+ Dst |= (Src << Shift) & getBitMask(Shift, Width);
+ return Dst;
+}
+
+/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
+///
+/// \returns Unpacked bits.
+unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
+ return (Src & getBitMask(Shift, Width)) >> Shift;
+}
+
+/// \returns Vmcnt bit shift.
+unsigned getVmcntBitShift() { return 0; }
+
+/// \returns Vmcnt bit width.
+unsigned getVmcntBitWidth() { return 4; }
+
+/// \returns Expcnt bit shift.
+unsigned getExpcntBitShift() { return 4; }
+
+/// \returns Expcnt bit width.
+unsigned getExpcntBitWidth() { return 3; }
+
+/// \returns Lgkmcnt bit shift.
+unsigned getLgkmcntBitShift() { return 8; }
+
+/// \returns Lgkmcnt bit width.
+unsigned getLgkmcntBitWidth() { return 4; }
+
+} // anonymous namespace
+
namespace llvm {
namespace AMDGPU {
@@ -35,15 +87,27 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
if (Features.test(FeatureISAVersion7_0_1))
return {7, 0, 1};
+ if (Features.test(FeatureISAVersion7_0_2))
+ return {7, 0, 2};
+
if (Features.test(FeatureISAVersion8_0_0))
return {8, 0, 0};
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
+ if (Features.test(FeatureISAVersion8_0_2))
+ return {8, 0, 2};
+
if (Features.test(FeatureISAVersion8_0_3))
return {8, 0, 3};
+ if (Features.test(FeatureISAVersion8_0_4))
+ return {8, 0, 4};
+
+ if (Features.test(FeatureISAVersion8_1_0))
+ return {8, 1, 0};
+
return {0, 0, 0};
}
@@ -109,6 +173,10 @@ bool isReadOnlySegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}
+bool shouldEmitConstantsToTextSection(const Triple &TT) {
+ return TT.getOS() != Triple::AMDHSA;
+}
+
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
Attribute A = F.getFnAttribute(Name);
int Result = Default;
@@ -124,8 +192,88 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
return Result;
}
-unsigned getMaximumWorkGroupSize(const Function &F) {
- return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
+std::pair<int, int> getIntegerPairAttribute(const Function &F,
+ StringRef Name,
+ std::pair<int, int> Default,
+ bool OnlyFirstRequired) {
+ Attribute A = F.getFnAttribute(Name);
+ if (!A.isStringAttribute())
+ return Default;
+
+ LLVMContext &Ctx = F.getContext();
+ std::pair<int, int> Ints = Default;
+ std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
+ if (Strs.first.trim().getAsInteger(0, Ints.first)) {
+ Ctx.emitError("can't parse first integer attribute " + Name);
+ return Default;
+ }
+ if (Strs.second.trim().getAsInteger(0, Ints.second)) {
+ if (!OnlyFirstRequired || Strs.second.trim().size()) {
+ Ctx.emitError("can't parse second integer attribute " + Name);
+ return Default;
+ }
+ }
+
+ return Ints;
+}
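
Editor's note: getIntegerPairAttribute above consumes a string attribute in "first[,second]" form. A rough stand-alone sketch of that parsing shape follows (plain C++, not the LLVM API; the attribute string is a made-up example and error handling is simplified):

#include <cstdio>
#include <utility>

// Minimal sketch of the "first[,second]" format; the second value is
// optional here and falls back to Default.second when absent.
static std::pair<int, int> parsePair(const char *S, std::pair<int, int> Default) {
  std::pair<int, int> Ints = Default;
  int N = std::sscanf(S, "%d,%d", &Ints.first, &Ints.second);
  return N >= 1 ? Ints : Default;
}

int main() {
  std::pair<int, int> P = parsePair("128,256", {0, 0});
  printf("{%d, %d}\n", P.first, P.second);  // prints {128, 256}
  return 0;
}
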
+
+unsigned getWaitcntBitMask(IsaVersion Version) {
+ unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
+ unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return Vmcnt | Expcnt | Lgkmcnt;
+}
+
+unsigned getVmcntBitMask(IsaVersion Version) {
+ return (1 << getVmcntBitWidth()) - 1;
+}
+
+unsigned getExpcntBitMask(IsaVersion Version) {
+ return (1 << getExpcntBitWidth()) - 1;
+}
+
+unsigned getLgkmcntBitMask(IsaVersion Version) {
+ return (1 << getLgkmcntBitWidth()) - 1;
+}
+
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) {
+ return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) {
+ return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) {
+ return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+ unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
+ Vmcnt = decodeVmcnt(Version, Waitcnt);
+ Expcnt = decodeExpcnt(Version, Waitcnt);
+ Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
+}
+
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) {
+ return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) {
+ return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) {
+ return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+unsigned encodeWaitcnt(IsaVersion Version,
+ unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
+ unsigned Waitcnt = getWaitcntBitMask(Version);
+ Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
+ Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt);
+ Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt);
+ return Waitcnt;
}
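
Editor's note: the waitcnt helpers above pack three counters into a single immediate using the shifts and widths defined earlier (vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]). Below is a minimal stand-alone sketch of the same round trip; it mirrors encodeWaitcnt/decodeWaitcnt but does not use the LLVM helpers themselves:

#include <cassert>
#include <cstdio>

// Packs vmcnt into bits [3:0], expcnt into [6:4], lgkmcnt into [11:8].
static unsigned packWaitcnt(unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
  return (Vmcnt & 0xF) | ((Expcnt & 0x7) << 4) | ((Lgkmcnt & 0xF) << 8);
}

int main() {
  unsigned Waitcnt = packWaitcnt(/*Vmcnt=*/0, /*Expcnt=*/7, /*Lgkmcnt=*/15);
  assert(((Waitcnt >> 0) & 0xF) == 0);   // same field as decodeVmcnt
  assert(((Waitcnt >> 4) & 0x7) == 7);   // same field as decodeExpcnt
  assert(((Waitcnt >> 8) & 0xF) == 15);  // same field as decodeLgkmcnt
  printf("waitcnt immediate = 0x%x\n", Waitcnt);  // prints 0xf70
  return 0;
}
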
unsigned getInitialPSInputAddr(const Function &F) {
@@ -179,5 +327,135 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
return Reg;
}
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+ switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+ return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
+}
+
+// Avoid using MCRegisterClass::getSize, since that function will go away
+// (move from MC* level to Target* level). Return size in bits.
+unsigned getRegBitWidth(unsigned RCID) {
+ switch (RCID) {
+ case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VS_32RegClassID:
+ case AMDGPU::SReg_32RegClassID:
+ case AMDGPU::SReg_32_XM0RegClassID:
+ return 32;
+ case AMDGPU::SGPR_64RegClassID:
+ case AMDGPU::VS_64RegClassID:
+ case AMDGPU::SReg_64RegClassID:
+ case AMDGPU::VReg_64RegClassID:
+ return 64;
+ case AMDGPU::VReg_96RegClassID:
+ return 96;
+ case AMDGPU::SGPR_128RegClassID:
+ case AMDGPU::SReg_128RegClassID:
+ case AMDGPU::VReg_128RegClassID:
+ return 128;
+ case AMDGPU::SReg_256RegClassID:
+ case AMDGPU::VReg_256RegClassID:
+ return 256;
+ case AMDGPU::SReg_512RegClassID:
+ case AMDGPU::VReg_512RegClassID:
+ return 512;
+ default:
+ llvm_unreachable("Unexpected register class");
+ }
+}
+
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+ return getRegBitWidth(RC.getID());
+}
+
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+ unsigned OpNo) {
+ unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+ return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
+}
+
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
+ if (Literal >= -16 && Literal <= 64)
+ return true;
+
+ uint64_t Val = static_cast<uint64_t>(Literal);
+ return (Val == DoubleToBits(0.0)) ||
+ (Val == DoubleToBits(1.0)) ||
+ (Val == DoubleToBits(-1.0)) ||
+ (Val == DoubleToBits(0.5)) ||
+ (Val == DoubleToBits(-0.5)) ||
+ (Val == DoubleToBits(2.0)) ||
+ (Val == DoubleToBits(-2.0)) ||
+ (Val == DoubleToBits(4.0)) ||
+ (Val == DoubleToBits(-4.0)) ||
+ (Val == 0x3fc45f306dc9c882 && HasInv2Pi);
+}
+
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
+ if (Literal >= -16 && Literal <= 64)
+ return true;
+
+ // The actual type of the operand does not seem to matter as long
+ // as the bits match one of the inline immediate values. For example:
+ //
+ // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+ // so it is a legal inline immediate.
+ //
+ // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+ // floating-point, so it is a legal inline immediate.
+
+ uint32_t Val = static_cast<uint32_t>(Literal);
+ return (Val == FloatToBits(0.0f)) ||
+ (Val == FloatToBits(1.0f)) ||
+ (Val == FloatToBits(-1.0f)) ||
+ (Val == FloatToBits(0.5f)) ||
+ (Val == FloatToBits(-0.5f)) ||
+ (Val == FloatToBits(2.0f)) ||
+ (Val == FloatToBits(-2.0f)) ||
+ (Val == FloatToBits(4.0f)) ||
+ (Val == FloatToBits(-4.0f)) ||
+ (Val == 0x3e22f983 && HasInv2Pi);
+}
+
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+ assert(HasInv2Pi);
+
+ if (Literal >= -16 && Literal <= 64)
+ return true;
+
+ uint16_t Val = static_cast<uint16_t>(Literal);
+ return Val == 0x3C00 || // 1.0
+ Val == 0xBC00 || // -1.0
+ Val == 0x3800 || // 0.5
+ Val == 0xB800 || // -0.5
+ Val == 0x4000 || // 2.0
+ Val == 0xC000 || // -2.0
+ Val == 0x4400 || // 4.0
+ Val == 0xC400 || // -4.0
+ Val == 0x3118; // 1/2pi
+}
+
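
Editor's note: the isInlinableLiteral* checks above accept small integers in [-16, 64] plus a fixed set of floating-point bit patterns (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, and 1/2pi when the inv2pi feature is present). A small stand-alone illustration of the fp32 bit patterns, assuming IEEE-754 single precision:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterprets a float's bits, the same idea as FloatToBits above.
static uint32_t floatToBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  printf("1.0f -> 0x%08x\n", floatToBits(1.0f));  // 0x3f800000, inlinable
  printf("0.5f -> 0x%08x\n", floatToBits(0.5f));  // 0x3f000000, inlinable
  printf("3.0f -> 0x%08x\n", floatToBits(3.0f));  // 0x40400000, not in the list
  return 0;
}
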
} // End namespace AMDGPU
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 995a904..ea5fc36 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -13,17 +13,29 @@
#include "AMDKernelCodeT.h"
#include "llvm/IR/CallingConv.h"
+#include "SIDefines.h"
+
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+
namespace llvm {
class FeatureBitset;
class Function;
class GlobalValue;
class MCContext;
+class MCInstrDesc;
+class MCRegisterClass;
+class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
namespace AMDGPU {
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
struct IsaVersion {
unsigned Major;
unsigned Minor;
@@ -45,9 +57,86 @@ bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
+/// \returns True if constants should be emitted to .text section for given
+/// target triple \p TT, false otherwise.
+bool shouldEmitConstantsToTextSection(const Triple &TT);
+
+/// \returns Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
int getIntegerAttribute(const Function &F, StringRef Name, int Default);
-unsigned getMaximumWorkGroupSize(const Function &F);
+/// \returns A pair of integer values requested using \p F's \p Name attribute
+/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
+/// is false).
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if one of the requested values cannot be
+/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
+/// not present.
+std::pair<int, int> getIntegerPairAttribute(const Function &F,
+ StringRef Name,
+ std::pair<int, int> Default,
+ bool OnlyFirstRequired = false);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(IsaVersion Version);
+
+/// \returns Vmcnt bit mask for given isa \p Version.
+unsigned getVmcntBitMask(IsaVersion Version);
+
+/// \returns Expcnt bit mask for given isa \p Version.
+unsigned getExpcntBitMask(IsaVersion Version);
+
+/// \returns Lgkmcnt bit mask for given isa \p Version.
+unsigned getLgkmcntBitMask(IsaVersion Version);
+
+/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
+/// \p Lgkmcnt respectively.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+/// \p Vmcnt = \p Waitcnt[3:0]
+/// \p Expcnt = \p Waitcnt[6:4]
+/// \p Lgkmcnt = \p Waitcnt[11:8]
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+ unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
+
+/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+
+/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+
+/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+
+/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// \p Version.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
+/// Waitcnt[3:0] = \p Vmcnt
+/// Waitcnt[6:4] = \p Expcnt
+/// Waitcnt[11:8] = \p Lgkmcnt
+///
+/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
+/// isa \p Version.
+unsigned encodeWaitcnt(IsaVersion Version,
+ unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
+
unsigned getInitialPSInputAddr(const Function &F);
bool isShader(CallingConv::ID cc);
@@ -61,6 +150,66 @@ bool isVI(const MCSubtargetInfo &STI);
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
+/// \brief Can this operand also contain immediate values?
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Is this floating-point operand?
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Does this operand support only inlinable literals?
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Get the size in bits of a register from the register class with ID \p RCID.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(const MCRegisterClass &RC);
+
+/// \brief Get the size in bytes of a register operand.
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+ unsigned OpNo);
+
+LLVM_READNONE
+inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
+ switch (OpInfo.OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ return 4;
+
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ return 8;
+
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ return 2;
+
+ default:
+ llvm_unreachable("unhandled operand type");
+ }
+}
+
+LLVM_READNONE
+inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
+ return getOperandSize(Desc.OpInfo[OpNo]);
+}
+
+/// \brief Is this literal inlinable?
+LLVM_READNONE
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 3a5ff60..c55eaab 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -16,10 +16,10 @@
#define QNAME(name) amd_kernel_code_t::name
#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
-#define FIELD2(sname, name) \
- RECORD(sname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+#define FIELD2(sname, aname, name) \
+ RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
-#define FIELD(name) FIELD2(name, name)
+#define FIELD(name) FIELD2(name, name, name)
#define PRINTCODEPROP(name) \
@@ -33,7 +33,7 @@
AMD_CODE_PROPERTY_##name##_WIDTH>
#define CODEPROP(name, shift) \
- RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+ RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
// have to define these lambdas because of Set/GetMacro
#define PRINTCOMP(GetMacro, Shift) \
@@ -50,32 +50,70 @@
return true; \
}
-#define COMPPGM(name, GetMacro, SetMacro, Shift) \
- RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \
+ RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
-#define COMPPGM1(name, AccMacro) \
- COMPPGM(compute_pgm_rsrc1_##name, \
- G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
-#define COMPPGM2(name, AccMacro) \
- COMPPGM(compute_pgm_rsrc2_##name, \
- G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
///////////////////////////////////////////////////////////////////////////////
// Begin of the table
// Define RECORD(name, print, parse) in your code to get field definitions
// and include this file
-FIELD2(kernel_code_version_major, amd_kernel_code_version_major),
-FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor),
-FIELD2(machine_kind, amd_machine_kind),
-FIELD2(machine_version_major, amd_machine_version_major),
-FIELD2(machine_version_minor, amd_machine_version_minor),
-FIELD2(machine_version_stepping, amd_machine_version_stepping),
+FIELD2(amd_code_version_major, kernel_code_version_major, amd_kernel_code_version_major),
+FIELD2(amd_code_version_minor, kernel_code_version_minor, amd_kernel_code_version_minor),
+FIELD2(amd_machine_kind, machine_kind, amd_machine_kind),
+FIELD2(amd_machine_version_major, machine_version_major, amd_machine_version_major),
+FIELD2(amd_machine_version_minor, machine_version_minor, amd_machine_version_minor),
+FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_version_stepping),
+
FIELD(kernel_code_entry_byte_offset),
FIELD(kernel_code_prefetch_byte_size),
FIELD(max_scratch_backing_memory_byte_size),
-FIELD(compute_pgm_resource_registers),
+
+COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS),
+COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS),
+COMPPGM1(priority, compute_pgm_rsrc1_priority, PRIORITY),
+COMPPGM1(float_mode, compute_pgm_rsrc1_float_mode, FLOAT_MODE), // TODO: split float_mode
+COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV),
+COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP),
+COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE),
+COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE),
+// TODO: bulky
+// TODO: cdbg_user
+COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
+COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR),
+// TODO: enable_trap_handler
+COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN),
+COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN),
+COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN),
+COMPPGM2(enable_sgpr_workgroup_info, compute_pgm_rsrc2_tg_size_en, TG_SIZE_EN),
+COMPPGM2(enable_vgpr_workitem_id, compute_pgm_rsrc2_tidig_comp_cnt, TIDIG_COMP_CNT),
+COMPPGM2(enable_exception_msb, compute_pgm_rsrc2_excp_en_msb, EXCP_EN_MSB), // TODO: split enable_exception_msb
+COMPPGM2(granulated_lds_size, compute_pgm_rsrc2_lds_size, LDS_SIZE),
+COMPPGM2(enable_exception, compute_pgm_rsrc2_excp_en, EXCP_EN), // TODO: split enable_exception
+
+CODEPROP(enable_sgpr_private_segment_buffer, ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr, ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr, ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr, ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id, ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init, ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64, IS_PTR64),
+CODEPROP(is_dynamic_callstack, IS_DYNAMIC_CALLSTACK),
+CODEPROP(is_debug_enabled, IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled, IS_XNACK_SUPPORTED),
+
FIELD(workitem_private_segment_byte_size),
FIELD(workgroup_group_segment_byte_size),
FIELD(gds_segment_byte_size),
@@ -94,59 +132,8 @@ FIELD(group_segment_alignment),
FIELD(private_segment_alignment),
FIELD(wavefront_size),
FIELD(call_convention),
-FIELD(runtime_loader_kernel_symbol),
-
-COMPPGM1(vgprs, VGPRS),
-COMPPGM1(sgprs, SGPRS),
-COMPPGM1(priority, PRIORITY),
-COMPPGM1(float_mode, FLOAT_MODE),
-COMPPGM1(priv, PRIV),
-COMPPGM1(dx10_clamp, DX10_CLAMP),
-COMPPGM1(debug_mode, DEBUG_MODE),
-COMPPGM1(ieee_mode, IEEE_MODE),
-COMPPGM2(scratch_en, SCRATCH_EN),
-COMPPGM2(user_sgpr, USER_SGPR),
-COMPPGM2(tgid_x_en, TGID_X_EN),
-COMPPGM2(tgid_y_en, TGID_Y_EN),
-COMPPGM2(tgid_z_en, TGID_Z_EN),
-COMPPGM2(tg_size_en, TG_SIZE_EN),
-COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT),
-COMPPGM2(excp_en_msb, EXCP_EN_MSB),
-COMPPGM2(lds_size, LDS_SIZE),
-COMPPGM2(excp_en, EXCP_EN),
-
-CODEPROP(enable_sgpr_private_segment_buffer,
- ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
-CODEPROP(enable_sgpr_dispatch_ptr,
- ENABLE_SGPR_DISPATCH_PTR),
-CODEPROP(enable_sgpr_queue_ptr,
- ENABLE_SGPR_QUEUE_PTR),
-CODEPROP(enable_sgpr_kernarg_segment_ptr,
- ENABLE_SGPR_KERNARG_SEGMENT_PTR),
-CODEPROP(enable_sgpr_dispatch_id,
- ENABLE_SGPR_DISPATCH_ID),
-CODEPROP(enable_sgpr_flat_scratch_init,
- ENABLE_SGPR_FLAT_SCRATCH_INIT),
-CODEPROP(enable_sgpr_private_segment_size,
- ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
-CODEPROP(enable_sgpr_grid_workgroup_count_x,
- ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
-CODEPROP(enable_sgpr_grid_workgroup_count_y,
- ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
-CODEPROP(enable_sgpr_grid_workgroup_count_z,
- ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
-CODEPROP(enable_ordered_append_gds,
- ENABLE_ORDERED_APPEND_GDS),
-CODEPROP(private_element_size,
- PRIVATE_ELEMENT_SIZE),
-CODEPROP(is_ptr64,
- IS_PTR64),
-CODEPROP(is_dynamic_callstack,
- IS_DYNAMIC_CALLSTACK),
-CODEPROP(is_debug_enabled,
- IS_DEBUG_SUPPORTED),
-CODEPROP(is_xnack_enabled,
- IS_XNACK_SUPPORTED)
+FIELD(runtime_loader_kernel_symbol)
+// TODO: control_directive
// end of the table
///////////////////////////////////////////////////////////////////////////////
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index f64973a..0333b0a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -24,22 +24,37 @@ using namespace llvm;
static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
static StringRef const Table[] = {
"", // not found placeholder
-#define RECORD(name, print, parse) #name
+#define RECORD(name, altName, print, parse) #name
#include "AMDKernelCodeTInfo.h"
#undef RECORD
};
return makeArrayRef(Table);
}
-static StringMap<int> createIndexMap(const ArrayRef<StringRef> &a) {
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() {
+ static StringRef const Table[] = {
+ "", // not found placeholder
+#define RECORD(name, altName, print, parse) #altName
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names,
+ const ArrayRef<StringRef> &altNames) {
StringMap<int> map;
- for (auto Name : a)
- map.insert(std::make_pair(Name, map.size()));
+ assert(names.size() == altNames.size());
+ for (unsigned i = 0; i < names.size(); ++i) {
+ map.insert(std::make_pair(names[i], i));
+ map.insert(std::make_pair(altNames[i], i));
+ }
return map;
}
static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
- static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames());
+ static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames(),
+ get_amd_kernel_code_t_FldAltNames());
return map.lookup(name) - 1; // returns -1 if not found
}
@@ -73,7 +88,7 @@ typedef void(*PrintFx)(StringRef,
static ArrayRef<PrintFx> getPrinterTable() {
static const PrintFx Table[] = {
-#define RECORD(name, print, parse) print
+#define RECORD(name, altName, print, parse) print
#include "AMDKernelCodeTInfo.h"
#undef RECORD
};
@@ -145,7 +160,7 @@ typedef bool(*ParseFx)(amd_kernel_code_t &,
static ArrayRef<ParseFx> getParserTable() {
static const ParseFx Table[] = {
-#define RECORD(name, print, parse) parse
+#define RECORD(name, altName, print, parse) parse
#include "AMDKernelCodeTInfo.h"
#undef RECORD
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
index 912ed53..1fd1c1e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -11,283 +11,6 @@
//
//===----------------------------------------------------------------------===//
-class DSe_vi <bits<8> op> : Enc64 {
- bits<8> vdst;
- bits<1> gds;
- bits<8> addr;
- bits<8> data0;
- bits<8> data1;
- bits<8> offset0;
- bits<8> offset1;
-
- let Inst{7-0} = offset0;
- let Inst{15-8} = offset1;
- let Inst{16} = gds;
- let Inst{24-17} = op;
- let Inst{31-26} = 0x36; //encoding
- let Inst{39-32} = addr;
- let Inst{47-40} = data0;
- let Inst{55-48} = data1;
- let Inst{63-56} = vdst;
-}
-
-class MUBUFe_vi <bits<7> op> : Enc64 {
- bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> lds;
- bits<8> vaddr;
- bits<8> vdata;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{16} = lds;
- let Inst{17} = slc;
- let Inst{24-18} = op;
- let Inst{31-26} = 0x38; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
-}
-
-class MTBUFe_vi <bits<4> op> : Enc64 {
- bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<4> dfmt;
- bits<3> nfmt;
- bits<8> vaddr;
- bits<8> vdata;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{18-15} = op;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
- let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
-}
-
-class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
- bits<7> sbase;
- bits<7> sdst;
- bits<1> glc;
-
- let Inst{5-0} = sbase{6-1};
- let Inst{12-6} = sdst;
- let Inst{16} = glc;
- let Inst{17} = imm;
- let Inst{25-18} = op;
- let Inst{31-26} = 0x30; //encoding
-}
-
-class SMEM_IMMe_vi <bits<8> op> : SMEMe_vi<op, 1> {
- bits<20> offset;
- let Inst{51-32} = offset;
-}
-
-class SMEM_SOFFe_vi <bits<8> op> : SMEMe_vi<op, 0> {
- bits<20> soff;
- let Inst{51-32} = soff;
-}
-
-class VOP3a_vi <bits<10> op> : Enc64 {
- bits<2> src0_modifiers;
- bits<9> src0;
- bits<2> src1_modifiers;
- bits<9> src1;
- bits<2> src2_modifiers;
- bits<9> src2;
- bits<1> clamp;
- bits<2> omod;
-
- let Inst{8} = src0_modifiers{1};
- let Inst{9} = src1_modifiers{1};
- let Inst{10} = src2_modifiers{1};
- let Inst{15} = clamp;
- let Inst{25-16} = op;
- let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = src0;
- let Inst{49-41} = src1;
- let Inst{58-50} = src2;
- let Inst{60-59} = omod;
- let Inst{61} = src0_modifiers{0};
- let Inst{62} = src1_modifiers{0};
- let Inst{63} = src2_modifiers{0};
-}
-
-class VOP3e_vi <bits<10> op> : VOP3a_vi <op> {
- bits<8> vdst;
-
- let Inst{7-0} = vdst;
-}
-
-// Encoding used for VOPC instructions encoded as VOP3
-// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
-class VOP3ce_vi <bits<10> op> : VOP3a_vi <op> {
- bits<8> sdst;
-
- let Inst{7-0} = sdst;
-}
-
-class VOP3be_vi <bits<10> op> : Enc64 {
- bits<8> vdst;
- bits<2> src0_modifiers;
- bits<9> src0;
- bits<2> src1_modifiers;
- bits<9> src1;
- bits<2> src2_modifiers;
- bits<9> src2;
- bits<7> sdst;
- bits<2> omod;
- bits<1> clamp;
-
- let Inst{7-0} = vdst;
- let Inst{14-8} = sdst;
- let Inst{15} = clamp;
- let Inst{25-16} = op;
- let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = src0;
- let Inst{49-41} = src1;
- let Inst{58-50} = src2;
- let Inst{60-59} = omod;
- let Inst{61} = src0_modifiers{0};
- let Inst{62} = src1_modifiers{0};
- let Inst{63} = src2_modifiers{0};
-}
-
-class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
- VOPAnyCommon <outs, ins, asm, pattern> {
- let DPP = 1;
- let Size = 8;
-
- let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", "");
-}
-
-class VOP_DPPe : Enc64 {
- bits<2> src0_modifiers;
- bits<8> src0;
- bits<2> src1_modifiers;
- bits<9> dpp_ctrl;
- bits<1> bound_ctrl;
- bits<4> bank_mask;
- bits<4> row_mask;
-
- let Inst{39-32} = src0;
- let Inst{48-40} = dpp_ctrl;
- let Inst{51} = bound_ctrl;
- let Inst{52} = src0_modifiers{0}; // src0_neg
- let Inst{53} = src0_modifiers{1}; // src0_abs
- let Inst{54} = src1_modifiers{0}; // src1_neg
- let Inst{55} = src1_modifiers{1}; // src1_abs
- let Inst{59-56} = bank_mask;
- let Inst{63-60} = row_mask;
-}
-
-class VOP1_DPPe <bits<8> op> : VOP_DPPe {
- bits<8> vdst;
-
- let Inst{8-0} = 0xfa; // dpp
- let Inst{16-9} = op;
- let Inst{24-17} = vdst;
- let Inst{31-25} = 0x3f; //encoding
-}
-
-class VOP2_DPPe <bits<6> op> : VOP_DPPe {
- bits<8> vdst;
- bits<8> src1;
-
- let Inst{8-0} = 0xfa; //dpp
- let Inst{16-9} = src1;
- let Inst{24-17} = vdst;
- let Inst{30-25} = op;
- let Inst{31} = 0x0; //encoding
-}
-
-class VOP_SDWA <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
- VOPAnyCommon <outs, ins, asm, pattern> {
- let SDWA = 1;
- let Size = 8;
-}
-
-class VOP_SDWAe : Enc64 {
- bits<8> src0;
- bits<3> src0_sel;
- bits<2> src0_fmodifiers; // {abs,neg}
- bits<1> src0_imodifiers; // sext
- bits<3> src1_sel;
- bits<2> src1_fmodifiers;
- bits<1> src1_imodifiers;
- bits<3> dst_sel;
- bits<2> dst_unused;
- bits<1> clamp;
-
- let Inst{39-32} = src0;
- let Inst{42-40} = dst_sel;
- let Inst{44-43} = dst_unused;
- let Inst{45} = clamp;
- let Inst{50-48} = src0_sel;
- let Inst{53-52} = src0_fmodifiers;
- let Inst{51} = src0_imodifiers;
- let Inst{58-56} = src1_sel;
- let Inst{61-60} = src1_fmodifiers;
- let Inst{59} = src1_imodifiers;
-}
-
-class VOP1_SDWAe <bits<8> op> : VOP_SDWAe {
- bits<8> vdst;
-
- let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = op;
- let Inst{24-17} = vdst;
- let Inst{31-25} = 0x3f; // encoding
-}
-
-class VOP2_SDWAe <bits<6> op> : VOP_SDWAe {
- bits<8> vdst;
- bits<8> src1;
-
- let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = src1;
- let Inst{24-17} = vdst;
- let Inst{30-25} = op;
- let Inst{31} = 0x0; // encoding
-}
-
-class VOPC_SDWAe <bits<8> op> : VOP_SDWAe {
- bits<8> src1;
-
- let Inst{8-0} = 0xf9; // sdwa
- let Inst{16-9} = src1;
- let Inst{24-17} = op;
- let Inst{31-25} = 0x3e; // encoding
-
- // VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = 0x6;
- let Inst{44-43} = 0x2;
-}
-
class EXPe_vi : EXPe {
let Inst{31-26} = 0x31; //encoding
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
index 5c490ab..b45c8fc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -9,150 +9,6 @@
// Instruction definitions for VI and newer.
//===----------------------------------------------------------------------===//
-let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
-
-let DisableSIDecoder = 1 in {
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-defm V_CVT_F16_U16 : VOP1Inst <vop1<0, 0x39>, "v_cvt_f16_u16", VOP_F16_I16>;
-defm V_CVT_F16_I16 : VOP1Inst <vop1<0, 0x3a>, "v_cvt_f16_i16", VOP_F16_I16>;
-defm V_CVT_U16_F16 : VOP1Inst <vop1<0, 0x3b>, "v_cvt_u16_f16", VOP_I16_F16>;
-defm V_CVT_I16_F16 : VOP1Inst <vop1<0, 0x3c>, "v_cvt_i16_f16", VOP_I16_F16>;
-defm V_RCP_F16 : VOP1Inst <vop1<0, 0x3d>, "v_rcp_f16", VOP_F16_F16>;
-defm V_SQRT_F16 : VOP1Inst <vop1<0, 0x3e>, "v_sqrt_f16", VOP_F16_F16>;
-defm V_RSQ_F16 : VOP1Inst <vop1<0, 0x3f>, "v_rsq_f16", VOP_F16_F16>;
-defm V_LOG_F16 : VOP1Inst <vop1<0, 0x40>, "v_log_f16", VOP_F16_F16>;
-defm V_EXP_F16 : VOP1Inst <vop1<0, 0x41>, "v_exp_f16", VOP_F16_F16>;
-defm V_FREXP_MANT_F16 : VOP1Inst <vop1<0, 0x42>, "v_frexp_mant_f16",
- VOP_F16_F16
->;
-defm V_FREXP_EXP_I16_F16 : VOP1Inst <vop1<0, 0x43>, "v_frexp_exp_i16_f16",
- VOP_I16_F16
->;
-defm V_FLOOR_F16 : VOP1Inst <vop1<0, 0x44>, "v_floor_f16", VOP_F16_F16>;
-defm V_CEIL_F16 : VOP1Inst <vop1<0, 0x45>, "v_ceil_f16", VOP_F16_F16>;
-defm V_TRUNC_F16 : VOP1Inst <vop1<0, 0x46>, "v_trunc_f16", VOP_F16_F16>;
-defm V_RNDNE_F16 : VOP1Inst <vop1<0, 0x47>, "v_rndne_f16", VOP_F16_F16>;
-defm V_FRACT_F16 : VOP1Inst <vop1<0, 0x48>, "v_fract_f16", VOP_F16_F16>;
-defm V_SIN_F16 : VOP1Inst <vop1<0, 0x49>, "v_sin_f16", VOP_F16_F16>;
-defm V_COS_F16 : VOP1Inst <vop1<0, 0x4a>, "v_cos_f16", VOP_F16_F16>;
-
-//===----------------------------------------------------------------------===//
-// VOP2 Instructions
-//===----------------------------------------------------------------------===//
-
-let isCommutable = 1 in {
-
-defm V_ADD_F16 : VOP2Inst <vop2<0, 0x1f>, "v_add_f16", VOP_F16_F16_F16>;
-defm V_SUB_F16 : VOP2Inst <vop2<0, 0x20>, "v_sub_f16", VOP_F16_F16_F16>;
-defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16,
- null_frag, "v_sub_f16"
->;
-defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>;
-defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>;
-} // End isCommutable = 1
-defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16", VOP_MADMK>;
-let isCommutable = 1 in {
-defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16", VOP_MADAK>;
-defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>;
-defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>;
-defm V_MUL_LO_U16 : VOP2Inst <vop2<0,0x29>, "v_mul_lo_u16", VOP_I16_I16_I16>;
-} // End isCommutable = 1
-defm V_LSHLREV_B16 : VOP2Inst <vop2<0,0x2a>, "v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst <vop2<0,0x2b>, "v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_B16 : VOP2Inst <vop2<0,0x2c>, "v_ashrrev_b16", VOP_I16_I16_I16>;
-let isCommutable = 1 in {
-defm V_MAX_F16 : VOP2Inst <vop2<0,0x2d>, "v_max_f16", VOP_F16_F16_F16>;
-defm V_MIN_F16 : VOP2Inst <vop2<0,0x2e>, "v_min_f16", VOP_F16_F16_F16>;
-defm V_MAX_U16 : VOP2Inst <vop2<0,0x2f>, "v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst <vop2<0,0x30>, "v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst <vop2<0,0x31>, "v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
-} // End isCommutable = 1
-defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
-
-//===----------------------------------------------------------------------===//
-// VOP3 Instructions
-//===----------------------------------------------------------------------===//
-let isCommutable = 1 in {
- defm V_MAD_F16 : VOP3Inst <vop3<0, 0x1ea>, "v_mad_f16", VOP_F16_F16_F16_F16>;
- defm V_MAD_U16 : VOP3Inst <vop3<0, 0x1eb>, "v_mad_u16", VOP_I16_I16_I16_I16>;
- defm V_MAD_I16 : VOP3Inst <vop3<0, 0x1ec>, "v_mad_i16", VOP_I16_I16_I16_I16>;
-}
-} // let DisableSIDecoder = 1
-
-// Aliases to simplify matching of floating-point instructions that
-// are VOP2 on SI and VOP3 on VI.
-
-class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
- name#" $dst, $src0, $src1",
- (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0)
->, PredicateControl {
- let UseInstAsmMatchConverter = 0;
-}
-
-def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-
-//===----------------------------------------------------------------------===//
-// SMEM Instructions
-//===----------------------------------------------------------------------===//
-
-def S_DCACHE_WB : SMEM_Inval <0x21,
- "s_dcache_wb", int_amdgcn_s_dcache_wb>;
-
-def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
- "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
-
-def S_MEMREALTIME : SMEM_Ret<0x25,
- "s_memrealtime", int_amdgcn_s_memrealtime>;
-
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
-let Predicates = [isVI] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-//===----------------------------------------------------------------------===//
-// DPP Patterns
-//===----------------------------------------------------------------------===//
-
-def : Pat <
- (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
- imm:$bound_ctrl),
- (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
- (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
->;
-
-//===----------------------------------------------------------------------===//
-// Misc Patterns
-//===----------------------------------------------------------------------===//
-
-def : Pat <
- (i64 (readcyclecounter)),
- (S_MEMREALTIME)
->;
-
-//===----------------------------------------------------------------------===//
-// DS_PERMUTE/DS_BPERMUTE Instructions.
-//===----------------------------------------------------------------------===//
-
-let Uses = [EXEC] in {
-defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32,
- int_amdgcn_ds_permute>;
-defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32,
- int_amdgcn_ds_bpermute>;
-}
-
-} // End Predicates = [isVI]
+FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
+file is no longer used by the backend, so it can be deleted once all
+the buildbots update their dependencies.
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
new file mode 100644
index 0000000..8cae83c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -0,0 +1,615 @@
+//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP1 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
+ bits<8> vdst;
+ bits<9> src0;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f; //encoding
+}
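
As a sanity check of the VOP1e field layout, here is a small C++ sketch (not the MC emitter) that packs the word for v_mov_b32_e32 v0, v1, taking op = 0x1 from the VOP1_Real_si list further down and assuming the usual 9-bit source encoding in which VGPR n is 256 + n:

    #include <cstdint>
    #include <cstdio>

    // Pack a VOP1 word per the VOP1e class: src0 in bits 8-0, op in 16-9,
    // vdst in 24-17, and the fixed 0x3f VOP1 encoding in 31-25.
    static uint32_t encodeVOP1(uint32_t op, uint32_t vdst, uint32_t src0) {
      return (src0 & 0x1ff) | ((op & 0xff) << 9) |
             ((vdst & 0xff) << 17) | (0x3fu << 25);
    }

    int main() {
      // v_mov_b32_e32 v0, v1: vdst = v0 -> 0, src0 = v1 -> 256 + 1 (assumed).
      std::printf("0x%08x\n", encodeVOP1(0x1, 0, 256 + 1)); // 0x7e000301
    }
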
+
+class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f; // encoding
+}
+
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+ InstSI <P.Outs32, P.Ins32, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#"_e32", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm32;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP1 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ VOPProfile Pfl = P;
+}
+
+class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
+class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
+ list<dag> ret = !if(P.HasModifiers,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
+}
+
+multiclass VOP1Inst <string opName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+ def _e32 : VOP1_Pseudo <opName, P>;
+ def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+ def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
+}
+
+let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
+} // End isMoveImm = 1
+
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+// TODO: Make profile for this, there is VOP3 encoding also
+def V_READFIRSTLANE_B32 :
+ InstSI <(outs SReg_32:$vdst),
+ (ins VGPR_32:$src0),
+ "v_readfirstlane_b32 $vdst, $src0",
+ [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
+ Enc32 {
+
+ let isCodeGenOnly = 0;
+ let UseNamedOperandTable = 1;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP1 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+ let isConvergent = 1;
+
+ bits<8> vdst;
+ bits<9> src0;
+
+ let Inst{8-0} = src0;
+ let Inst{16-9} = 0x2;
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; //encoding
+}
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
+defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
+defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
+defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
+defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
+defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
+defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
+defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
+} // End SchedRW = [WriteDouble]
+
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
+
+let SchedRW = [WriteDouble] in {
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
+} // End SchedRW = [WriteDouble]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
+defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End SchedRW = [WriteDoubleAdd]
+
+defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
+defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
+}
+
+// Restrict src0 to be VGPR
+def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
+ let Src0RC32 = VRegSrc_32;
+ let Src0RC64 = VRegSrc_32;
+
+ let HasExt = 0;
+}
+
+// Special case because there are no true output operands. Hack vdst
+// to be a src operand. The custom inserter must add a tied implicit
+// def and use of the super register since there seems to be no way to
+// add an implicit def of a virtual register in tablegen.
+def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
+ let Src0RC32 = VOPDstOperand<VGPR_32>;
+ let Src0RC64 = VOPDstOperand<VGPR_32>;
+
+ let Outs = (outs);
+ let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
+ let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
+ let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel);
+
+ let Asm32 = getAsm32<1, 1>.ret;
+ let Asm64 = getAsm64<1, 1, 0>.ret;
+ let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+
+ let HasExt = 0;
+ let HasDst = 0;
+ let EmitDst = 1; // force vdst emission
+}
+
+let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
+// v_movreld_b32 is a special case because the destination output
+// register is really a source. It isn't actually read (but may be
+// written), and is only to provide the base register to start
+// indexing from. Tablegen seems to not let you define an implicit
+// virtual register output for the super register being written into,
+// so this must have an implicit def of the register added to it.
+defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
+defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
+defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
+} // End SubtargetPredicate = HasMovrel, Uses = [M0, EXEC]
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
+defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
+defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
+defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
+} // End SchedRW = [WriteDouble]
+
+} // End SubtargetPredicate = isSICI
+
+
+let SubtargetPredicate = isCIVI in {
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
+defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
+defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
+defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
+} // End SchedRW = [WriteQuarterRate32]
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
+defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
+defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
+defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
+defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
+defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
+defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
+defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
+defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+
+}
+
+let Predicates = [isVI] in {
+
+def : Pat<
+ (f32 (f16_to_fp i16:$src)),
+ (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat<
+ (i16 (fp_to_f16 f32:$src)),
+ (V_CVT_F16_F32_e32 $src)
+>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_si <bits<9> op> {
+ let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+ def _e32_si :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+}
+
+defm V_NOP : VOP1_Real_si <0x0>;
+defm V_MOV_B32 : VOP1_Real_si <0x1>;
+defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
+defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
+defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
+defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
+defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
+defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
+defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
+defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
+defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
+defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
+defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
+defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
+defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
+defm V_FRACT_F32 : VOP1_Real_si <0x20>;
+defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
+defm V_CEIL_F32 : VOP1_Real_si <0x22>;
+defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
+defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
+defm V_EXP_F32 : VOP1_Real_si <0x25>;
+defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
+defm V_LOG_F32 : VOP1_Real_si <0x27>;
+defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
+defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
+defm V_RCP_F32 : VOP1_Real_si <0x2a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
+defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
+defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
+defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
+defm V_RCP_F64 : VOP1_Real_si <0x2f>;
+defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
+defm V_RSQ_F64 : VOP1_Real_si <0x31>;
+defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
+defm V_SQRT_F32 : VOP1_Real_si <0x33>;
+defm V_SQRT_F64 : VOP1_Real_si <0x34>;
+defm V_SIN_F32 : VOP1_Real_si <0x35>;
+defm V_COS_F32 : VOP1_Real_si <0x36>;
+defm V_NOT_B32 : VOP1_Real_si <0x37>;
+defm V_BFREV_B32 : VOP1_Real_si <0x38>;
+defm V_FFBH_U32 : VOP1_Real_si <0x39>;
+defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
+defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
+defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
+defm V_CLREXCP : VOP1_Real_si <0x41>;
+defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
+defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
+defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_ci <bits<9> op> {
+ let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
+ def _e32_ci :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ def _e64_ci :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+}
+
+defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
+defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
+defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
+defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
+defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
+defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
+ VOP_DPP <ps.OpName, P> {
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ bits<8> vdst;
+ let Inst{8-0} = 0xfa; // dpp
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f; //encoding
+}
+
+multiclass VOP1_Real_vi <bits<10> op> {
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ def _e32_vi :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+  // For now, DPP is only supported for asm/disasm
+ // TODO: add corresponding pseudo
+ def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_NOP : VOP1_Real_vi <0x0>;
+defm V_MOV_B32 : VOP1_Real_vi <0x1>;
+defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>;
+defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>;
+defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
+defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
+defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
+defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
+defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
+defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>;
+defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>;
+defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>;
+defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>;
+defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>;
+defm V_FRACT_F32 : VOP1_Real_vi <0x1b>;
+defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>;
+defm V_CEIL_F32 : VOP1_Real_vi <0x1d>;
+defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>;
+defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>;
+defm V_EXP_F32 : VOP1_Real_vi <0x20>;
+defm V_LOG_F32 : VOP1_Real_vi <0x21>;
+defm V_RCP_F32 : VOP1_Real_vi <0x22>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>;
+defm V_RSQ_F32 : VOP1_Real_vi <0x24>;
+defm V_RCP_F64 : VOP1_Real_vi <0x25>;
+defm V_RSQ_F64 : VOP1_Real_vi <0x26>;
+defm V_SQRT_F32 : VOP1_Real_vi <0x27>;
+defm V_SQRT_F64 : VOP1_Real_vi <0x28>;
+defm V_SIN_F32 : VOP1_Real_vi <0x29>;
+defm V_COS_F32 : VOP1_Real_vi <0x2a>;
+defm V_NOT_B32 : VOP1_Real_vi <0x2b>;
+defm V_BFREV_B32 : VOP1_Real_vi <0x2c>;
+defm V_FFBH_U32 : VOP1_Real_vi <0x2d>;
+defm V_FFBL_B32 : VOP1_Real_vi <0x2e>;
+defm V_FFBH_I32 : VOP1_Real_vi <0x2f>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
+defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>;
+defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
+defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
+defm V_CLREXCP : VOP1_Real_vi <0x35>;
+defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
+defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
+defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
+defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
+defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
+defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
+defm V_RNDNE_F64 : VOP1_Real_vi <0x19>;
+defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>;
+defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>;
+defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>;
+defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>;
+defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>;
+defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>;
+defm V_RCP_F16 : VOP1_Real_vi <0x3d>;
+defm V_SQRT_F16 : VOP1_Real_vi <0x3e>;
+defm V_RSQ_F16 : VOP1_Real_vi <0x3f>;
+defm V_LOG_F16 : VOP1_Real_vi <0x40>;
+defm V_EXP_F16 : VOP1_Real_vi <0x41>;
+defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
+defm V_FLOOR_F16 : VOP1_Real_vi <0x44>;
+defm V_CEIL_F16 : VOP1_Real_vi <0x45>;
+defm V_TRUNC_F16 : VOP1_Real_vi <0x46>;
+defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
+defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
+defm V_SIN_F16 : VOP1_Real_vi <0x49>;
+defm V_COS_F16 : VOP1_Real_vi <0x4a>;
+
+
+// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
+// indexing mode. vdst can't be treated as a def for codegen purposes,
+// and an implicit use and def of the super register should be added.
+def V_MOV_B32_indirect : VPseudoInstSI<(outs),
+ (ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
+ PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
+ getVOPSrc0ForVT<i32>.ret:$src0)> {
+ let VOP1 = 1;
+ let SubtargetPredicate = isVI;
+}
+
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
+ (outs rc:$vdst),
+ (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
+ let VOP1 = 1;
+
+ let Constraints = "$vsrc = $vdst";
+ let Uses = [M0, EXEC];
+
+ let SubtargetPredicate = HasMovrel;
+}
+
+def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
+def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
+def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
+def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
+def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
+
+let Predicates = [isVI] in {
+
+def : Pat <
+ (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl)),
+ (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+ (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
+
+def : Pat<
+ (i32 (anyext i16:$src)),
+ (COPY $src)
+>;
+
+def : Pat<
+ (i64 (anyext i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (COPY $src)), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : Pat<
+ (i16 (trunc i32:$src)),
+ (COPY $src)
+>;
+
+def : Pat <
+ (i16 (trunc i64:$src)),
+ (EXTRACT_SUBREG $src, sub0)
+>;
+
+} // End Predicates = [isVI]
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
new file mode 100644
index 0000000..00e5ab3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -0,0 +1,757 @@
+//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP2 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP2e <bits<6> op, VOPProfile P> : Enc32 {
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> src1;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; //encoding
+}
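
The same exercise for the VOP2e layout: a C++ sketch (again an illustration, not the backend emitter) packing v_add_f32_e32 v0, v1, v2 with the SI opcode 0x3 from the VOP2_Real_e32e64_si list further down, assuming VGPR n encodes as 256 + n in the 9-bit src0 field while src1 holds a plain 8-bit VGPR index:

    #include <cstdint>
    #include <cstdio>

    // src0 in bits 8-0, src1 in 16-9, vdst in 24-17, op in 30-25, bit 31 clear.
    static uint32_t encodeVOP2(uint32_t op, uint32_t vdst,
                               uint32_t src0, uint32_t src1) {
      return (src0 & 0x1ff) | ((src1 & 0xff) << 9) |
             ((vdst & 0xff) << 17) | ((op & 0x3f) << 25);
    }

    int main() {
      // v_add_f32_e32 v0, v1, v2 on SI (op 0x3, vdst v0, src0 v1, src1 v2).
      std::printf("0x%08x\n", encodeVOP2(0x3, 0, 256 + 1, 2)); // 0x06000501
    }
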
+
+class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> src1;
+ bits<32> imm;
+
+ let Inst{8-0} = !if(P.HasSrc0, src0, 0);
+ let Inst{16-9} = !if(P.HasSrc1, src1, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+ let Inst{63-32} = imm;
+}
+
+class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+}
+
+class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
+ InstSI <P.Outs32, P.Ins32, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#suffix, opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm32;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ let VOP2 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+
+ VOPProfile Pfl = P;
+}
+
+class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
+class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
+ list<dag> ret = !if(P.HasModifiers,
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
+}
+
+multiclass VOP2Inst <string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName> {
+
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+}
+
+// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
+multiclass VOP2bInst <string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
+
+ let SchedRW = [Write32Bit, WriteSALU] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+ }
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ }
+}
+
+multiclass VOP2eInst <string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
+
+ let SchedRW = [Write32Bit] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
+ def _e32 : VOP2_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ }
+ def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ }
+}
+
+class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+ field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
+ field bit HasExt = 0;
+}
+
+def VOP_MADAK_F16 : VOP_MADAK <f16>;
+def VOP_MADAK_F32 : VOP_MADAK <f32>;
+
+class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+ field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
+ field bit HasExt = 0;
+}
+
+def VOP_MADMK_F16 : VOP_MADMK <f16>;
+def VOP_MADMK_F32 : VOP_MADMK <f32>;
+
+class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ VGPR_32:$src2, // stub argument
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm32 = getAsm32<1, 2, vt>.ret;
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 0;
+ let HasExt = 1;
+}
+
+def VOP_MAC_F16 : VOP_MAC <f16> {
+ // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+ // 'not a string initializer' error.
+ let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+}
+
+def VOP_MAC_F32 : VOP_MAC <f32> {
+ // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+ // 'not a string initializer' error.
+ let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+}
+
+// Write out to vcc or arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Asm32 = "$vdst, vcc, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+}
+
+// Write out to vcc or arbitrary SGPR and read in from vcc or
+// arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ // We use VCSrc_b32 to exclude literal constants, even though the
+ // encoding normally allows them since the implicit VCC use means
+ // using one would always violate the constant bus
+ // restriction. SGPRs are still allowed because it should
+ // technically be possible to use VCC again as src0.
+ let Src0RC32 = VCSrc_b32;
+ let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+ let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
+ Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+ let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
+ Src1Mod:$src1_modifiers, Src1DPP:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let HasExt = 1;
+}
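
The two VOP2b profiles above describe the carry-out-only and carry-in/carry-out forms. As a rough host-side model of how such instructions are usually chained (an assumption about typical use, not something stated in this file), a 64-bit add splits into a low-half add producing a carry and a high-half add-with-carry consuming it:

    #include <cstdint>
    #include <cstdio>
    #include <utility>

    // Model of v_add_i32: 32-bit sum plus a carry-out bit ("vcc").
    static std::pair<uint32_t, bool> add_i32(uint32_t a, uint32_t b) {
      uint32_t s = a + b;
      return {s, s < a};
    }

    // Model of v_addc_u32: same, but also consumes a carry-in bit.
    static std::pair<uint32_t, bool> addc_u32(uint32_t a, uint32_t b, bool cin) {
      auto [s, c] = add_i32(a, b);
      uint32_t t = s + (cin ? 1u : 0u);
      return {t, c || t < s};
    }

    int main() {
      uint64_t a = 0x00000001ffffffffULL, b = 5;
      auto lo = add_i32(uint32_t(a), uint32_t(b));
      auto hi = addc_u32(uint32_t(a >> 32), uint32_t(b >> 32), lo.second);
      uint64_t sum = (uint64_t(hi.first) << 32) | lo.first;
      std::printf("0x%016llx\n", (unsigned long long)sum); // 0x0000000200000004
    }
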
+
+// Read in from vcc or arbitrary SGPR
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
+ let Asm32 = "$vdst, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+}
+
+def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
+ let Outs32 = (outs SReg_32:$vdst);
+ let Outs64 = Outs32;
+ let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
+ let Ins64 = Ins32;
+ let Asm32 = " $vdst, $src0, $src1";
+ let Asm64 = Asm32;
+}
+
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+ let Outs32 = (outs VGPR_32:$vdst);
+ let Outs64 = Outs32;
+ let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+ let Ins64 = Ins32;
+ let Asm32 = " $vdst, $src0, $src1";
+ let Asm64 = Asm32;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
+defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
+defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
+defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
+}
+
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">;
+defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">;
+} // End isCommutable = 1
+
+// These are special and do not read the exec mask.
+let isConvergent = 1, Uses = []<Register> in {
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
+ [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+} // End isConvergent = 1
+
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+
+} // End SubtargetPredicate = isGCN
+
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
+defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
+
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
+
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+
+let isCommutable = 1 in {
+defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
+defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
+defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
+defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
+}
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+// Note: 16-bit instructions produce a 0 result in the high 16 bits.
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+ (op i16:$src0, i16:$src1),
+ (inst $src0, $src1)
+>;
+
+def : Pat<
+ (i32 (zext (op i16:$src0, i16:$src1))),
+ (inst $src0, $src1)
+>;
+
+def : Pat<
+ (i64 (zext (op i16:$src0, i16:$src1))),
+ (REG_SEQUENCE VReg_64,
+ (inst $src0, $src1), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+}
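
A tiny host-side check of the note above Arithmetic_i16_Pats, treating the zeroed high half as an assumption about the VI VALU rather than something this file proves: if the 16-bit op writes zeros into the upper 16 bits of its 32-bit result, the zext patterns can select the same instruction with no extra masking.

    #include <cassert>
    #include <cstdint>

    // Model of a VI 16-bit VALU op writing a zero-extended 32-bit result.
    static uint32_t v_add_u16_model(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a + b); // high 16 bits are zero
    }

    int main() {
      uint16_t a = 0xfff0, b = 0x0025;
      uint32_t r = v_add_u16_model(a, b);
      assert(r == uint32_t(uint16_t(a + b))); // zero-extension is free
      assert((r >> 16) == 0);
      return 0;
    }
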
+
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+ (op i16:$src0, i16:$src1),
+ (inst $src1, $src0)
+>;
+
+def : Pat<
+ (i32 (zext (op i16:$src0, i16:$src1))),
+ (inst $src1, $src0)
+>;
+
+
+def : Pat<
+ (i64 (zext (op i16:$src0, i16:$src1))),
+ (REG_SEQUENCE VReg_64,
+ (inst $src1, $src0), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+class ZExt_i16_i1_Pat <SDNode ext> : Pat <
+ (i16 (ext i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
+>;
+
+let Predicates = [isVI] in {
+
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
+
+def : Pat <
+ (and i16:$src0, i16:$src1),
+ (V_AND_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+ (or i16:$src0, i16:$src1),
+ (V_OR_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+ (xor i16:$src0, i16:$src1),
+ (V_XOR_B32_e64 $src0, $src1)
+>;
+
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
+
+def : ZExt_i16_i1_Pat<zext>;
+def : ZExt_i16_i1_Pat<anyext>;
+
+def : Pat <
+ (i16 (sext i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
+>;
+
+} // End Predicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+multiclass VOP2_Real_si <bits<6> op> {
+ def _si :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_si <bits<6> op> {
+ def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_si <bits<6> op> {
+ def _e32_si :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>;
+defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>;
+defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>;
+defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>;
+defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>;
+defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>;
+defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>;
+defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>;
+defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>;
+defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>;
+defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>;
+defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>;
+defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>;
+defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>;
+defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>;
+defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>;
+defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>;
+defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>;
+defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>;
+defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>;
+defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>;
+defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>;
+defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>;
+defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>;
+defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
+
+defm V_READLANE_B32 : VOP2_Real_si <0x01>;
+defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
+
+defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
+defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
+defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>;
+defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>;
+defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>;
+defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>;
+
+defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>;
+defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>;
+defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> :
+ VOP_DPP <ps.OpName, P> {
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ bits<8> vdst;
+ bits<8> src1;
+ let Inst{8-0} = 0xfa; //dpp
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; //encoding
+}
+
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+multiclass VOP32_Real_vi <bits<10> op> {
+ def _vi :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_vi <bits<6> op> {
+ def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_vi <bits<6> op> {
+ def _e32_vi :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e64_vi <bits<10> op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
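+// On VI the e64 (VOP3) form of a VOP2 opcode is encoded at 0x100 + op; the
+// {0, 1, 0, 0, op{5-0}} literals below build that 10-bit VOP3 opcode.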
+multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
+ VOP2_Real_e32_vi<op>,
+ VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+
+multiclass VOP2_SDWA_Real <bits<6> op> {
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+}
+
+multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
+ Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ // For now, DPP is defined only for the assembler/disassembler.
+ // TODO: add a corresponding pseudo instruction.
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_e32e64_vi <bits<6> op> :
+ Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ // For now, DPP is defined only for the assembler/disassembler.
+ // TODO: add a corresponding pseudo instruction.
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
+defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
+defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
+defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
+defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_vi <0x7>;
+defm V_MUL_U32_U24 : VOP2_Real_e32e64_vi <0x8>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_vi <0x9>;
+defm V_MIN_F32 : VOP2_Real_e32e64_vi <0xa>;
+defm V_MAX_F32 : VOP2_Real_e32e64_vi <0xb>;
+defm V_MIN_I32 : VOP2_Real_e32e64_vi <0xc>;
+defm V_MAX_I32 : VOP2_Real_e32e64_vi <0xd>;
+defm V_MIN_U32 : VOP2_Real_e32e64_vi <0xe>;
+defm V_MAX_U32 : VOP2_Real_e32e64_vi <0xf>;
+defm V_LSHRREV_B32 : VOP2_Real_e32e64_vi <0x10>;
+defm V_ASHRREV_I32 : VOP2_Real_e32e64_vi <0x11>;
+defm V_LSHLREV_B32 : VOP2_Real_e32e64_vi <0x12>;
+defm V_AND_B32 : VOP2_Real_e32e64_vi <0x13>;
+defm V_OR_B32 : VOP2_Real_e32e64_vi <0x14>;
+defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>;
+defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
+defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
+defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;
+defm V_ADD_I32 : VOP2be_Real_e32e64_vi <0x19>;
+defm V_SUB_I32 : VOP2be_Real_e32e64_vi <0x1a>;
+defm V_SUBREV_I32 : VOP2be_Real_e32e64_vi <0x1b>;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_vi <0x1c>;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_vi <0x1d>;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
+
+defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
+defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
+
+defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+
+defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
+defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
+defm V_SUBREV_F16 : VOP2_Real_e32e64_vi <0x21>;
+defm V_MUL_F16 : VOP2_Real_e32e64_vi <0x22>;
+defm V_MAC_F16 : VOP2_Real_e32e64_vi <0x23>;
+defm V_MADMK_F16 : VOP2_Real_MADK_vi <0x24>;
+defm V_MADAK_F16 : VOP2_Real_MADK_vi <0x25>;
+defm V_ADD_U16 : VOP2_Real_e32e64_vi <0x26>;
+defm V_SUB_U16 : VOP2_Real_e32e64_vi <0x27>;
+defm V_SUBREV_U16 : VOP2_Real_e32e64_vi <0x28>;
+defm V_MUL_LO_U16 : VOP2_Real_e32e64_vi <0x29>;
+defm V_LSHLREV_B16 : VOP2_Real_e32e64_vi <0x2a>;
+defm V_LSHRREV_B16 : VOP2_Real_e32e64_vi <0x2b>;
+defm V_ASHRREV_I16 : VOP2_Real_e32e64_vi <0x2c>;
+defm V_MAX_F16 : VOP2_Real_e32e64_vi <0x2d>;
+defm V_MIN_F16 : VOP2_Real_e32e64_vi <0x2e>;
+defm V_MAX_U16 : VOP2_Real_e32e64_vi <0x2f>;
+defm V_MAX_I16 : VOP2_Real_e32e64_vi <0x30>;
+defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
+defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
+defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;
+
+let SubtargetPredicate = isVI in {
+
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
+class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+ name#" $dst, $src0, $src1",
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0)
+>, PredicateControl {
+ let UseInstAsmMatchConverter = 0;
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+}
+
+def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
+
+} // End SubtargetPredicate = isVI
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
new file mode 100644
index 0000000..c2a4d4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -0,0 +1,451 @@
+//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3 Classes
+//===----------------------------------------------------------------------===//
+
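+// Builds the selection pattern for a VOP3 instruction with input modifiers:
+// src0 also carries the clamp and omod operands, and ret1/ret2/ret3 are chosen
+// by the profile's NumSrcArgs.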
+class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
+ list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
+ list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))];
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+ VOP3_Pseudo<OpName, P,
+ !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret),
+ VOP3Only>;
+
+// Special case for v_div_fmas_{f32|f64}, since it seems to be the
+// only VOP instruction that implicitly reads VCC.
+let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
+def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+}
+
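+// Same as the three-source modifier pattern above, but with an additional
+// implicit (i1 VCC) operand for the v_div_fmas profiles.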
+class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret =
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
+ (i1 VCC)))];
+}
+
+class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
+ // FIXME: Hack to stop printing _e64
+ let Outs64 = (outs DstRC.RegClass:$vdst);
+ let Asm64 = " " # P.Asm64;
+}
+
+class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+ // v_div_scale_{f32|f64} do not support input modifiers.
+ let HasModifiers = 0;
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+}
+
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+}
+
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+
+def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>;
+def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>;
+def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
+def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
+def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
+def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let Uses = [VCC, EXEC] in {
+// v_div_fmas_f32:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^32
+//
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
+ getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+ let SchedRW = [WriteFloatFMA];
+}
+// v_div_fmas_f64:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^64
+//
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
+ getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+ let SchedRW = [WriteDouble];
+}
+} // End Uses = [VCC, EXEC]
+
+} // End isCommutable = 1
+
+def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
+def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
+def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
+def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
+def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
+def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
+def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>;
+def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>;
+def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>;
+def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
+def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
+def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
+ let SchedRW = [WriteFloatFMA, WriteSALU];
+ let hasExtraSrcRegAllocReq = 1;
+ let AsmMatchConverter = "";
+}
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
+ let SchedRW = [WriteDouble, WriteSALU];
+ let hasExtraSrcRegAllocReq = 1;
+ let AsmMatchConverter = "";
+}
+
+def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+
+def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
+ let SchedRW = [WriteDouble];
+}
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
+} // End SubtargetPredicate = isVI
+
+
+let SubtargetPredicate = isCIVI in {
+
+def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
+def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+
+let isCommutable = 1 in {
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+
+// XXX - Does this set VCC?
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+let isCommutable = 1 in {
+
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
+def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
+def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>;
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+
+def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+let Predicates = [isVI] in {
+
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
+def : Pat<
+ (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+ (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+ (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+ (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+ (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+ (REG_SEQUENCE VReg_64,
+ (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
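+// Fold (mul a, b) + c on i16 into v_mad_u16/v_mad_i16, including the cases
+// where the i16 result is zero- or sign-extended to i32 or i64.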
+defm : Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm : Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
+
+} // End Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+multiclass VOP3_Real_si<bits<9> op> {
+ def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3be_Real_si<bits<9> op> {
+ def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>;
+defm V_MAD_F32 : VOP3_Real_si <0x141>;
+defm V_MAD_I32_I24 : VOP3_Real_si <0x142>;
+defm V_MAD_U32_U24 : VOP3_Real_si <0x143>;
+defm V_CUBEID_F32 : VOP3_Real_si <0x144>;
+defm V_CUBESC_F32 : VOP3_Real_si <0x145>;
+defm V_CUBETC_F32 : VOP3_Real_si <0x146>;
+defm V_CUBEMA_F32 : VOP3_Real_si <0x147>;
+defm V_BFE_U32 : VOP3_Real_si <0x148>;
+defm V_BFE_I32 : VOP3_Real_si <0x149>;
+defm V_BFI_B32 : VOP3_Real_si <0x14a>;
+defm V_FMA_F32 : VOP3_Real_si <0x14b>;
+defm V_FMA_F64 : VOP3_Real_si <0x14c>;
+defm V_LERP_U8 : VOP3_Real_si <0x14d>;
+defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>;
+defm V_MULLIT_F32 : VOP3_Real_si <0x150>;
+defm V_MIN3_F32 : VOP3_Real_si <0x151>;
+defm V_MIN3_I32 : VOP3_Real_si <0x152>;
+defm V_MIN3_U32 : VOP3_Real_si <0x153>;
+defm V_MAX3_F32 : VOP3_Real_si <0x154>;
+defm V_MAX3_I32 : VOP3_Real_si <0x155>;
+defm V_MAX3_U32 : VOP3_Real_si <0x156>;
+defm V_MED3_F32 : VOP3_Real_si <0x157>;
+defm V_MED3_I32 : VOP3_Real_si <0x158>;
+defm V_MED3_U32 : VOP3_Real_si <0x159>;
+defm V_SAD_U8 : VOP3_Real_si <0x15a>;
+defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>;
+defm V_SAD_U16 : VOP3_Real_si <0x15c>;
+defm V_SAD_U32 : VOP3_Real_si <0x15d>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>;
+defm V_LSHL_B64 : VOP3_Real_si <0x161>;
+defm V_LSHR_B64 : VOP3_Real_si <0x162>;
+defm V_ASHR_I64 : VOP3_Real_si <0x163>;
+defm V_ADD_F64 : VOP3_Real_si <0x164>;
+defm V_MUL_F64 : VOP3_Real_si <0x165>;
+defm V_MIN_F64 : VOP3_Real_si <0x166>;
+defm V_MAX_F64 : VOP3_Real_si <0x167>;
+defm V_LDEXP_F64 : VOP3_Real_si <0x168>;
+defm V_MUL_LO_U32 : VOP3_Real_si <0x169>;
+defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>;
+defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>;
+defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>;
+defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>;
+defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>;
+defm V_MSAD_U8 : VOP3_Real_si <0x171>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3_Real_ci<bits<9> op> {
+ def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ }
+}
+
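+// These CI instructions reuse the SI VOP3 encoding (SIEncodingFamily.SI) and
+// are restricted to CI by the isCIOnly predicate set above.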
+defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
+defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>;
+defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>;
+defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+multiclass VOP3_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3be_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+
+defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
+defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>;
+defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>;
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>;
+defm V_MAD_F32 : VOP3_Real_vi <0x1c1>;
+defm V_MAD_I32_I24 : VOP3_Real_vi <0x1c2>;
+defm V_MAD_U32_U24 : VOP3_Real_vi <0x1c3>;
+defm V_CUBEID_F32 : VOP3_Real_vi <0x1c4>;
+defm V_CUBESC_F32 : VOP3_Real_vi <0x1c5>;
+defm V_CUBETC_F32 : VOP3_Real_vi <0x1c6>;
+defm V_CUBEMA_F32 : VOP3_Real_vi <0x1c7>;
+defm V_BFE_U32 : VOP3_Real_vi <0x1c8>;
+defm V_BFE_I32 : VOP3_Real_vi <0x1c9>;
+defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
+defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
+defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
+defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
+defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
+defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
+defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
+defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
+defm V_MAX3_F32 : VOP3_Real_vi <0x1d3>;
+defm V_MAX3_I32 : VOP3_Real_vi <0x1d4>;
+defm V_MAX3_U32 : VOP3_Real_vi <0x1d5>;
+defm V_MED3_F32 : VOP3_Real_vi <0x1d6>;
+defm V_MED3_I32 : VOP3_Real_vi <0x1d7>;
+defm V_MED3_U32 : VOP3_Real_vi <0x1d8>;
+defm V_SAD_U8 : VOP3_Real_vi <0x1d9>;
+defm V_SAD_HI_U8 : VOP3_Real_vi <0x1da>;
+defm V_SAD_U16 : VOP3_Real_vi <0x1db>;
+defm V_SAD_U32 : VOP3_Real_vi <0x1dc>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_vi <0x1dd>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_vi <0x1de>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_vi <0x1df>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_vi <0x1e0>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_vi <0x1e1>;
+defm V_DIV_FMAS_F32 : VOP3_Real_vi <0x1e2>;
+defm V_DIV_FMAS_F64 : VOP3_Real_vi <0x1e3>;
+defm V_MSAD_U8 : VOP3_Real_vi <0x1e4>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>;
+defm V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>;
+
+defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
+defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
+defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
+
+defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
+defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
+
+defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>;
+defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>;
+defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>;
+defm V_ADD_F64 : VOP3_Real_vi <0x280>;
+defm V_MUL_F64 : VOP3_Real_vi <0x281>;
+defm V_MIN_F64 : VOP3_Real_vi <0x282>;
+defm V_MAX_F64 : VOP3_Real_vi <0x283>;
+defm V_LDEXP_F64 : VOP3_Real_vi <0x284>;
+defm V_MUL_LO_U32 : VOP3_Real_vi <0x285>;
+
+// V_MUL_LO_I32 was removed from VI as it is identical to V_MUL_LO_U32; keep it
+// for the assembler only.
+let isAsmParserOnly = 1 in {
+defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>;
+}
+
+defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>;
+defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>;
+
+defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
+defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
+defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
new file mode 100644
index 0000000..16a456d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -0,0 +1,1144 @@
+//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Encodings
+//===----------------------------------------------------------------------===//
+
+class VOPCe <bits<8> op> : Enc32 {
+ bits<9> src0;
+ bits<8> src1;
+
+ let Inst{8-0} = src0;
+ let Inst{16-9} = src1;
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e;
+}
+
+class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+
+ // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+ let Inst{42-40} = SDWA.DWORD;
+ let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+}
+
+//===----------------------------------------------------------------------===//
+// VOPC classes
+//===----------------------------------------------------------------------===//
+
+// VOPC instructions are a special case because for the 32-bit
+// encoding, we want to display the implicit vcc write as if it were
+// an explicit $dst.
+class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
+ VOPProfile <[i1, vt0, vt1, untyped]> {
+ let Asm32 = "vcc, $src0, $src1";
+ // The destination for 32-bit encoding is implicit.
+ let HasDst32 = 0;
+ let Outs64 = (outs VOPDstS64:$sdst);
+ list<SchedReadWrite> Schedule = sched;
+}
+
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+ InstSI<(outs), P.Ins32, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm32;
+
+ let Size = 4;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+
+ let VALU = 1;
+ let VOPC = 1;
+ let Uses = [EXEC];
+ let Defs = [VCC];
+
+ let SubtargetPredicate = isGCN;
+
+ VOPProfile Pfl = P;
+}
+
+class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
+// This class is used only with VOPC instructions. Use $sdst for the output operand.
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
+ InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+
+ field bit isCompare;
+ field bit isCommutable;
+
+ let ResultInst =
+ !if (p.HasDst32,
+ !if (!eq(p.NumSrcArgs, 0),
+ // 1 dst, 0 src
+ (inst p.DstRC:$sdst),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 1 dst, 1 src
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0),
+ !if (!eq(p.NumSrcArgs, 2),
+ // 1 dst, 2 src
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ // else - unreachable
+ (inst)))),
+ // else
+ !if (!eq(p.NumSrcArgs, 2),
+ // 0 dst, 2 src
+ (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 0 dst, 1 src
+ (inst p.Src0RC32:$src1),
+ // else
+ // 0 dst, 0 src
+ (inst))));
+
+ let AsmVariantName = AMDGPUAsmVariants.Default;
+ let SubtargetPredicate = AssemblerPredicate;
+}
+
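+// Produces the _e32, _e64 and _sdwa forms of a comparison. DefExec = 1 is used
+// for the V_CMPX_* variants, which also write EXEC.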
+multiclass VOPC_Pseudos <string opName,
+ VOPC_Profile P,
+ PatLeaf cond = COND_NULL,
+ string revOp = opName,
+ bit DefExec = 0> {
+
+ def _e32 : VOPC_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+
+ def _e64 : VOP3_Pseudo<opName, P,
+ !if(P.HasModifiers,
+ [(set i1:$sdst,
+ (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ cond))],
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>,
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = P.Schedule;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+
+ def _sdwa : VOPC_SDWA_Pseudo <opName, P>,
+ Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ let isCommutable = 1;
+ }
+}
+
+def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
+def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
+def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
+def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+
+multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
+
+multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
+
+multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+
+multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
+
+multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
+
+multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+
+multiclass VOPCX_F16 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F32 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F64 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I16 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I32 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+
+
+//===----------------------------------------------------------------------===//
+// Compare instructions
+//===----------------------------------------------------------------------===//
+
+defm V_CMP_F_F32 : VOPC_F32 <"v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <"v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
+defm V_CMP_EQ_F32 : VOPC_F32 <"v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <"v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
+defm V_CMP_GT_F32 : VOPC_F32 <"v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <"v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <"v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <"v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <"v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <"v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">;
+defm V_CMP_NLG_F32 : VOPC_F32 <"v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <"v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
+defm V_CMP_NLE_F32 : VOPC_F32 <"v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <"v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <"v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <"v_cmp_tru_f32">;
+
+defm V_CMPX_F_F32 : VOPCX_F32 <"v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <"v_cmpx_lt_f32", "v_cmpx_gt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <"v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <"v_cmpx_le_f32", "v_cmpx_ge_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <"v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <"v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <"v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <"v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <"v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <"v_cmpx_nge_f32", "v_cmpx_nle_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <"v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <"v_cmpx_ngt_f32", "v_cmpx_nlt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <"v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <"v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <"v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <"v_cmpx_tru_f32">;
+
+defm V_CMP_F_F64 : VOPC_F64 <"v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <"v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
+defm V_CMP_EQ_F64 : VOPC_F64 <"v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <"v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
+defm V_CMP_GT_F64 : VOPC_F64 <"v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <"v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <"v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <"v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <"v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <"v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">;
+defm V_CMP_NLG_F64 : VOPC_F64 <"v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <"v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">;
+defm V_CMP_NLE_F64 : VOPC_F64 <"v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <"v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <"v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <"v_cmp_tru_f64">;
+
+defm V_CMPX_F_F64 : VOPCX_F64 <"v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <"v_cmpx_lt_f64", "v_cmpx_gt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <"v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <"v_cmpx_le_f64", "v_cmpx_ge_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <"v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <"v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <"v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <"v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <"v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <"v_cmpx_nge_f64", "v_cmpx_nle_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <"v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <"v_cmpx_ngt_f64", "v_cmpx_nlt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <"v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
+
+let SubtargetPredicate = isSICI in {
+
+defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
+defm V_CMPS_EQ_F32 : VOPC_F32 <"v_cmps_eq_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <"v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">;
+defm V_CMPS_GT_F32 : VOPC_F32 <"v_cmps_gt_f32">;
+defm V_CMPS_LG_F32 : VOPC_F32 <"v_cmps_lg_f32">;
+defm V_CMPS_GE_F32 : VOPC_F32 <"v_cmps_ge_f32">;
+defm V_CMPS_O_F32 : VOPC_F32 <"v_cmps_o_f32">;
+defm V_CMPS_U_F32 : VOPC_F32 <"v_cmps_u_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <"v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">;
+defm V_CMPS_NLG_F32 : VOPC_F32 <"v_cmps_nlg_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <"v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">;
+defm V_CMPS_NLE_F32 : VOPC_F32 <"v_cmps_nle_f32">;
+defm V_CMPS_NEQ_F32 : VOPC_F32 <"v_cmps_neq_f32">;
+defm V_CMPS_NLT_F32 : VOPC_F32 <"v_cmps_nlt_f32">;
+defm V_CMPS_TRU_F32 : VOPC_F32 <"v_cmps_tru_f32">;
+
+defm V_CMPSX_F_F32 : VOPCX_F32 <"v_cmpsx_f_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <"v_cmpsx_lt_f32", "v_cmpsx_gt_f32">;
+defm V_CMPSX_EQ_F32 : VOPCX_F32 <"v_cmpsx_eq_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <"v_cmpsx_le_f32", "v_cmpsx_ge_f32">;
+defm V_CMPSX_GT_F32 : VOPCX_F32 <"v_cmpsx_gt_f32">;
+defm V_CMPSX_LG_F32 : VOPCX_F32 <"v_cmpsx_lg_f32">;
+defm V_CMPSX_GE_F32 : VOPCX_F32 <"v_cmpsx_ge_f32">;
+defm V_CMPSX_O_F32 : VOPCX_F32 <"v_cmpsx_o_f32">;
+defm V_CMPSX_U_F32 : VOPCX_F32 <"v_cmpsx_u_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <"v_cmpsx_nge_f32", "v_cmpsx_nle_f32">;
+defm V_CMPSX_NLG_F32 : VOPCX_F32 <"v_cmpsx_nlg_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <"v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">;
+defm V_CMPSX_NLE_F32 : VOPCX_F32 <"v_cmpsx_nle_f32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_F32 <"v_cmpsx_neq_f32">;
+defm V_CMPSX_NLT_F32 : VOPCX_F32 <"v_cmpsx_nlt_f32">;
+defm V_CMPSX_TRU_F32 : VOPCX_F32 <"v_cmpsx_tru_f32">;
+
+defm V_CMPS_F_F64 : VOPC_F64 <"v_cmps_f_f64">;
+defm V_CMPS_LT_F64 : VOPC_F64 <"v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">;
+defm V_CMPS_EQ_F64 : VOPC_F64 <"v_cmps_eq_f64">;
+defm V_CMPS_LE_F64 : VOPC_F64 <"v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">;
+defm V_CMPS_GT_F64 : VOPC_F64 <"v_cmps_gt_f64">;
+defm V_CMPS_LG_F64 : VOPC_F64 <"v_cmps_lg_f64">;
+defm V_CMPS_GE_F64 : VOPC_F64 <"v_cmps_ge_f64">;
+defm V_CMPS_O_F64 : VOPC_F64 <"v_cmps_o_f64">;
+defm V_CMPS_U_F64 : VOPC_F64 <"v_cmps_u_f64">;
+defm V_CMPS_NGE_F64 : VOPC_F64 <"v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">;
+defm V_CMPS_NLG_F64 : VOPC_F64 <"v_cmps_nlg_f64">;
+defm V_CMPS_NGT_F64 : VOPC_F64 <"v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">;
+defm V_CMPS_NLE_F64 : VOPC_F64 <"v_cmps_nle_f64">;
+defm V_CMPS_NEQ_F64 : VOPC_F64 <"v_cmps_neq_f64">;
+defm V_CMPS_NLT_F64 : VOPC_F64 <"v_cmps_nlt_f64">;
+defm V_CMPS_TRU_F64 : VOPC_F64 <"v_cmps_tru_f64">;
+
+defm V_CMPSX_F_F64 : VOPCX_F64 <"v_cmpsx_f_f64">;
+defm V_CMPSX_LT_F64 : VOPCX_F64 <"v_cmpsx_lt_f64", "v_cmpsx_gt_f64">;
+defm V_CMPSX_EQ_F64 : VOPCX_F64 <"v_cmpsx_eq_f64">;
+defm V_CMPSX_LE_F64 : VOPCX_F64 <"v_cmpsx_le_f64", "v_cmpsx_ge_f64">;
+defm V_CMPSX_GT_F64 : VOPCX_F64 <"v_cmpsx_gt_f64">;
+defm V_CMPSX_LG_F64 : VOPCX_F64 <"v_cmpsx_lg_f64">;
+defm V_CMPSX_GE_F64 : VOPCX_F64 <"v_cmpsx_ge_f64">;
+defm V_CMPSX_O_F64 : VOPCX_F64 <"v_cmpsx_o_f64">;
+defm V_CMPSX_U_F64 : VOPCX_F64 <"v_cmpsx_u_f64">;
+defm V_CMPSX_NGE_F64 : VOPCX_F64 <"v_cmpsx_nge_f64", "v_cmpsx_nle_f64">;
+defm V_CMPSX_NLG_F64 : VOPCX_F64 <"v_cmpsx_nlg_f64">;
+defm V_CMPSX_NGT_F64 : VOPCX_F64 <"v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">;
+defm V_CMPSX_NLE_F64 : VOPCX_F64 <"v_cmpsx_nle_f64">;
+defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
+defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
+defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
+
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = Has16BitInsts in {
+
+defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">;
+defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">;
+defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>;
+defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">;
+defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>;
+defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>;
+defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>;
+defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>;
+defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>;
+defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">;
+defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>;
+defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">;
+defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>;
+defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>;
+defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>;
+defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">;
+
+defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">;
+defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">;
+defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">;
+defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">;
+defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">;
+defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">;
+defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">;
+defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">;
+defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">;
+defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16", "v_cmpx_nle_f16">;
+defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">;
+defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16", "v_cmpx_nlt_f16">;
+defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">;
+defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">;
+defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">;
+defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">;
+
+defm V_CMP_F_I16 : VOPC_I16 <"v_cmp_f_i16">;
+defm V_CMP_LT_I16 : VOPC_I16 <"v_cmp_lt_i16", COND_SLT, "v_cmp_gt_i16">;
+defm V_CMP_EQ_I16 : VOPC_I16 <"v_cmp_eq_i16">;
+defm V_CMP_LE_I16 : VOPC_I16 <"v_cmp_le_i16", COND_SLE, "v_cmp_ge_i16">;
+defm V_CMP_GT_I16 : VOPC_I16 <"v_cmp_gt_i16", COND_SGT>;
+defm V_CMP_NE_I16 : VOPC_I16 <"v_cmp_ne_i16">;
+defm V_CMP_GE_I16 : VOPC_I16 <"v_cmp_ge_i16", COND_SGE>;
+defm V_CMP_T_I16 : VOPC_I16 <"v_cmp_t_i16">;
+
+defm V_CMP_F_U16 : VOPC_I16 <"v_cmp_f_u16">;
+defm V_CMP_LT_U16 : VOPC_I16 <"v_cmp_lt_u16", COND_ULT, "v_cmp_gt_u16">;
+defm V_CMP_EQ_U16 : VOPC_I16 <"v_cmp_eq_u16", COND_EQ>;
+defm V_CMP_LE_U16 : VOPC_I16 <"v_cmp_le_u16", COND_ULE, "v_cmp_ge_u16">;
+defm V_CMP_GT_U16 : VOPC_I16 <"v_cmp_gt_u16", COND_UGT>;
+defm V_CMP_NE_U16 : VOPC_I16 <"v_cmp_ne_u16", COND_NE>;
+defm V_CMP_GE_U16 : VOPC_I16 <"v_cmp_ge_u16", COND_UGE>;
+defm V_CMP_T_U16 : VOPC_I16 <"v_cmp_t_u16">;
+
+defm V_CMPX_F_I16 : VOPCX_I16 <"v_cmpx_f_i16">;
+defm V_CMPX_LT_I16 : VOPCX_I16 <"v_cmpx_lt_i16", "v_cmpx_gt_i16">;
+defm V_CMPX_EQ_I16 : VOPCX_I16 <"v_cmpx_eq_i16">;
+defm V_CMPX_LE_I16 : VOPCX_I16 <"v_cmpx_le_i16", "v_cmpx_ge_i16">;
+defm V_CMPX_GT_I16 : VOPCX_I16 <"v_cmpx_gt_i16">;
+defm V_CMPX_NE_I16 : VOPCX_I16 <"v_cmpx_ne_i16">;
+defm V_CMPX_GE_I16 : VOPCX_I16 <"v_cmpx_ge_i16">;
+defm V_CMPX_T_I16 : VOPCX_I16 <"v_cmpx_t_i16">;
+
+defm V_CMPX_F_U16 : VOPCX_I16 <"v_cmpx_f_u16">;
+defm V_CMPX_LT_U16 : VOPCX_I16 <"v_cmpx_lt_u16", "v_cmpx_gt_u16">;
+defm V_CMPX_EQ_U16 : VOPCX_I16 <"v_cmpx_eq_u16">;
+defm V_CMPX_LE_U16 : VOPCX_I16 <"v_cmpx_le_u16", "v_cmpx_ge_u16">;
+defm V_CMPX_GT_U16 : VOPCX_I16 <"v_cmpx_gt_u16">;
+defm V_CMPX_NE_U16 : VOPCX_I16 <"v_cmpx_ne_u16">;
+defm V_CMPX_GE_U16 : VOPCX_I16 <"v_cmpx_ge_u16">;
+defm V_CMPX_T_U16 : VOPCX_I16 <"v_cmpx_t_u16">;
+
+} // End SubtargetPredicate = Has16BitInsts
+
+defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
+defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">;
+defm V_CMP_LE_I32 : VOPC_I32 <"v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">;
+defm V_CMP_GT_I32 : VOPC_I32 <"v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <"v_cmp_ne_i32">;
+defm V_CMP_GE_I32 : VOPC_I32 <"v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <"v_cmp_t_i32">;
+
+defm V_CMPX_F_I32 : VOPCX_I32 <"v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <"v_cmpx_lt_i32", "v_cmpx_gt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <"v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <"v_cmpx_le_i32", "v_cmpx_ge_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <"v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <"v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <"v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <"v_cmpx_t_i32">;
+
+defm V_CMP_F_I64 : VOPC_I64 <"v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <"v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">;
+defm V_CMP_EQ_I64 : VOPC_I64 <"v_cmp_eq_i64">;
+defm V_CMP_LE_I64 : VOPC_I64 <"v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">;
+defm V_CMP_GT_I64 : VOPC_I64 <"v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <"v_cmp_ne_i64">;
+defm V_CMP_GE_I64 : VOPC_I64 <"v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <"v_cmp_t_i64">;
+
+defm V_CMPX_F_I64 : VOPCX_I64 <"v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <"v_cmpx_lt_i64", "v_cmpx_gt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <"v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <"v_cmpx_le_i64", "v_cmpx_ge_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <"v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <"v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <"v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <"v_cmpx_t_i64">;
+
+defm V_CMP_F_U32 : VOPC_I32 <"v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <"v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
+defm V_CMP_EQ_U32 : VOPC_I32 <"v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <"v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
+defm V_CMP_GT_U32 : VOPC_I32 <"v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <"v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <"v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <"v_cmp_t_u32">;
+
+defm V_CMPX_F_U32 : VOPCX_I32 <"v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <"v_cmpx_lt_u32", "v_cmpx_gt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <"v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <"v_cmpx_le_u32", "v_cmpx_ge_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <"v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <"v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <"v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <"v_cmpx_t_u32">;
+
+defm V_CMP_F_U64 : VOPC_I64 <"v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <"v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
+defm V_CMP_EQ_U64 : VOPC_I64 <"v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <"v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
+defm V_CMP_GT_U64 : VOPC_I64 <"v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <"v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <"v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <"v_cmp_t_u64">;
+
+defm V_CMPX_F_U64 : VOPCX_I64 <"v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <"v_cmpx_lt_u64", "v_cmpx_gt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <"v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <"v_cmpx_le_u64", "v_cmpx_ge_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <"v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <"v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <"v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">;
+
+//===----------------------------------------------------------------------===//
+// Class instructions
+//===----------------------------------------------------------------------===//
+
+class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
+ VOPC_Profile<sched, vt, i32> {
+ let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = "$sdst, $src0_modifiers, $src1";
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+ let HasSrc1Mods = 0;
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
+class getVOPCClassPat64 <VOPProfile P> {
+ list<dag> ret =
+ [(set i1:$sdst,
+ (AMDGPUfp_class
+ (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)),
+ P.Src1VT:$src1))];
+}
+
+// Special case for class instructions which only have modifiers on
+// the 1st source operand.
+multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
+ def _e32 : VOPC_Pseudo <opName, p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ }
+
+ def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> {
+ let Defs = !if(DefExec, [EXEC], []);
+ let SchedRW = p.Schedule;
+ }
+
+ def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ }
+}
+
+def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
+
+multiclass VOPC_CLASS_F16 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
+
+multiclass VOPCX_CLASS_F16 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
+
+multiclass VOPC_CLASS_F32 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <string opName> :
+ VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>;
+
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
+defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+
+//===----------------------------------------------------------------------===//
+// V_ICMP / V_FCMP Intrinsic Patterns.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
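+// Select AMDGPUsetcc (used to lower the amdgcn icmp/fcmp intrinsics) directly
+// to the e64 compares, which write the full condition mask to an SGPR pair.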
+class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+ (AMDGPUsetcc vt:$src0, vt:$src1, cond),
+ (inst $src0, $src1)
+>;
+
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
+
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+
+} // End Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+multiclass VOPC_Real_si <bits<9> op> {
+ let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+ def _e32_si :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+
+ def _e64_si :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3
+ // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ }
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+ !cast<Instruction>(NAME#"_e32_si")> {
+ let AssemblerPredicate = isSICI;
+ }
+}
+
+defm V_CMP_F_F32 : VOPC_Real_si <0x0>;
+defm V_CMP_LT_F32 : VOPC_Real_si <0x1>;
+defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>;
+defm V_CMP_LE_F32 : VOPC_Real_si <0x3>;
+defm V_CMP_GT_F32 : VOPC_Real_si <0x4>;
+defm V_CMP_LG_F32 : VOPC_Real_si <0x5>;
+defm V_CMP_GE_F32 : VOPC_Real_si <0x6>;
+defm V_CMP_O_F32 : VOPC_Real_si <0x7>;
+defm V_CMP_U_F32 : VOPC_Real_si <0x8>;
+defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>;
+defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>;
+defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>;
+defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>;
+defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>;
+defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>;
+defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>;
+
+defm V_CMPX_F_F32 : VOPC_Real_si <0x10>;
+defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>;
+defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>;
+defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>;
+defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>;
+defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>;
+defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>;
+defm V_CMPX_O_F32 : VOPC_Real_si <0x17>;
+defm V_CMPX_U_F32 : VOPC_Real_si <0x18>;
+defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>;
+defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>;
+defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>;
+defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>;
+defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>;
+defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>;
+defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>;
+
+defm V_CMP_F_F64 : VOPC_Real_si <0x20>;
+defm V_CMP_LT_F64 : VOPC_Real_si <0x21>;
+defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>;
+defm V_CMP_LE_F64 : VOPC_Real_si <0x23>;
+defm V_CMP_GT_F64 : VOPC_Real_si <0x24>;
+defm V_CMP_LG_F64 : VOPC_Real_si <0x25>;
+defm V_CMP_GE_F64 : VOPC_Real_si <0x26>;
+defm V_CMP_O_F64 : VOPC_Real_si <0x27>;
+defm V_CMP_U_F64 : VOPC_Real_si <0x28>;
+defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>;
+defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>;
+defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>;
+defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>;
+defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>;
+defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>;
+
+defm V_CMPX_F_F64 : VOPC_Real_si <0x30>;
+defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>;
+defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>;
+defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>;
+defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>;
+defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>;
+defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>;
+defm V_CMPX_O_F64 : VOPC_Real_si <0x37>;
+defm V_CMPX_U_F64 : VOPC_Real_si <0x38>;
+defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>;
+defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>;
+defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>;
+defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>;
+defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>;
+defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>;
+defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>;
+
+defm V_CMPS_F_F32 : VOPC_Real_si <0x40>;
+defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>;
+defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>;
+defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>;
+defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>;
+defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>;
+defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>;
+defm V_CMPS_O_F32 : VOPC_Real_si <0x47>;
+defm V_CMPS_U_F32 : VOPC_Real_si <0x48>;
+defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>;
+defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>;
+defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>;
+defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>;
+defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>;
+defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>;
+defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>;
+
+defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>;
+defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>;
+defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>;
+defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>;
+defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>;
+defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>;
+defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>;
+defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>;
+defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>;
+defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>;
+defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>;
+defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>;
+defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>;
+defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>;
+defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>;
+defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>;
+
+defm V_CMPS_F_F64 : VOPC_Real_si <0x60>;
+defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>;
+defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>;
+defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>;
+defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>;
+defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>;
+defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>;
+defm V_CMPS_O_F64 : VOPC_Real_si <0x67>;
+defm V_CMPS_U_F64 : VOPC_Real_si <0x68>;
+defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>;
+defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>;
+defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>;
+defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>;
+defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>;
+defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>;
+defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>;
+
+defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>;
+defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>;
+defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>;
+defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>;
+defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>;
+defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>;
+defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>;
+defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>;
+defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>;
+defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>;
+defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>;
+defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>;
+defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>;
+defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>;
+defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>;
+defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>;
+
+defm V_CMP_F_I32 : VOPC_Real_si <0x80>;
+defm V_CMP_LT_I32 : VOPC_Real_si <0x81>;
+defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>;
+defm V_CMP_LE_I32 : VOPC_Real_si <0x83>;
+defm V_CMP_GT_I32 : VOPC_Real_si <0x84>;
+defm V_CMP_NE_I32 : VOPC_Real_si <0x85>;
+defm V_CMP_GE_I32 : VOPC_Real_si <0x86>;
+defm V_CMP_T_I32 : VOPC_Real_si <0x87>;
+
+defm V_CMPX_F_I32 : VOPC_Real_si <0x90>;
+defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>;
+defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>;
+defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>;
+defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>;
+defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>;
+defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>;
+defm V_CMPX_T_I32 : VOPC_Real_si <0x97>;
+
+defm V_CMP_F_I64 : VOPC_Real_si <0xa0>;
+defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>;
+defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>;
+defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>;
+defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>;
+defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>;
+defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>;
+defm V_CMP_T_I64 : VOPC_Real_si <0xa7>;
+
+defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>;
+defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>;
+defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>;
+defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>;
+defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>;
+defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>;
+defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>;
+defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>;
+
+defm V_CMP_F_U32 : VOPC_Real_si <0xc0>;
+defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>;
+defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>;
+defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>;
+defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>;
+defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>;
+defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>;
+defm V_CMP_T_U32 : VOPC_Real_si <0xc7>;
+
+defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>;
+defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>;
+defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>;
+defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>;
+defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>;
+defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>;
+defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>;
+defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>;
+
+defm V_CMP_F_U64 : VOPC_Real_si <0xe0>;
+defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>;
+defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>;
+defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>;
+defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>;
+defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>;
+defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>;
+defm V_CMP_T_U64 : VOPC_Real_si <0xe7>;
+
+defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>;
+defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>;
+defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>;
+defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>;
+defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>;
+defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>;
+defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>;
+defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>;
+
+defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
+defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+multiclass VOPC_Real_vi <bits<10> op> {
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ def _e32_vi :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+ VOPCe<op{7-0}>;
+
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3a_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3
+ // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ }
+
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+ !cast<Instruction>(NAME#"_e32_vi")> {
+ let AssemblerPredicate = isVI;
+ }
+}
+
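+// On VI each defm below creates the "_e32_vi", "_e64_vi" and "_sdwa_vi" real
+// encodings for the pseudo of the same name.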
+defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>;
+defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>;
+defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>;
+defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>;
+
+defm V_CMP_F_F16 : VOPC_Real_vi <0x20>;
+defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>;
+defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>;
+defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>;
+defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>;
+defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>;
+defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>;
+defm V_CMP_O_F16 : VOPC_Real_vi <0x27>;
+defm V_CMP_U_F16 : VOPC_Real_vi <0x28>;
+defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>;
+defm V_CMP_NLG_F16 : VOPC_Real_vi <0x2a>;
+defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>;
+defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>;
+defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>;
+defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>;
+defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>;
+
+defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>;
+defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>;
+defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>;
+defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>;
+defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>;
+defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>;
+defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>;
+defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>;
+defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>;
+defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>;
+defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>;
+defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>;
+defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>;
+defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>;
+defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x3e>;
+defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x3f>;
+
+defm V_CMP_F_F32 : VOPC_Real_vi <0x40>;
+defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>;
+defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>;
+defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>;
+defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>;
+defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>;
+defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>;
+defm V_CMP_O_F32 : VOPC_Real_vi <0x47>;
+defm V_CMP_U_F32 : VOPC_Real_vi <0x48>;
+defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>;
+defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>;
+defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>;
+defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>;
+defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>;
+defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>;
+
+defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>;
+defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>;
+defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>;
+defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>;
+defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>;
+defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>;
+defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>;
+defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>;
+defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>;
+defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>;
+defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>;
+defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>;
+defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>;
+defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>;
+defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>;
+defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>;
+
+defm V_CMP_F_F64 : VOPC_Real_vi <0x60>;
+defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>;
+defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>;
+defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>;
+defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>;
+defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>;
+defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>;
+defm V_CMP_O_F64 : VOPC_Real_vi <0x67>;
+defm V_CMP_U_F64 : VOPC_Real_vi <0x68>;
+defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>;
+defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>;
+defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>;
+defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>;
+defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>;
+defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>;
+
+defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>;
+defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>;
+defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>;
+defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>;
+defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>;
+defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>;
+defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>;
+defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>;
+defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>;
+defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>;
+defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>;
+defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>;
+defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>;
+defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>;
+defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>;
+defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>;
+
+defm V_CMP_F_I16 : VOPC_Real_vi <0xa0>;
+defm V_CMP_LT_I16 : VOPC_Real_vi <0xa1>;
+defm V_CMP_EQ_I16 : VOPC_Real_vi <0xa2>;
+defm V_CMP_LE_I16 : VOPC_Real_vi <0xa3>;
+defm V_CMP_GT_I16 : VOPC_Real_vi <0xa4>;
+defm V_CMP_NE_I16 : VOPC_Real_vi <0xa5>;
+defm V_CMP_GE_I16 : VOPC_Real_vi <0xa6>;
+defm V_CMP_T_I16 : VOPC_Real_vi <0xa7>;
+
+defm V_CMP_F_U16 : VOPC_Real_vi <0xa8>;
+defm V_CMP_LT_U16 : VOPC_Real_vi <0xa9>;
+defm V_CMP_EQ_U16 : VOPC_Real_vi <0xaa>;
+defm V_CMP_LE_U16 : VOPC_Real_vi <0xab>;
+defm V_CMP_GT_U16 : VOPC_Real_vi <0xac>;
+defm V_CMP_NE_U16 : VOPC_Real_vi <0xad>;
+defm V_CMP_GE_U16 : VOPC_Real_vi <0xae>;
+defm V_CMP_T_U16 : VOPC_Real_vi <0xaf>;
+
+defm V_CMPX_F_I16 : VOPC_Real_vi <0xb0>;
+defm V_CMPX_LT_I16 : VOPC_Real_vi <0xb1>;
+defm V_CMPX_EQ_I16 : VOPC_Real_vi <0xb2>;
+defm V_CMPX_LE_I16 : VOPC_Real_vi <0xb3>;
+defm V_CMPX_GT_I16 : VOPC_Real_vi <0xb4>;
+defm V_CMPX_NE_I16 : VOPC_Real_vi <0xb5>;
+defm V_CMPX_GE_I16 : VOPC_Real_vi <0xb6>;
+defm V_CMPX_T_I16 : VOPC_Real_vi <0xb7>;
+
+defm V_CMPX_F_U16 : VOPC_Real_vi <0xb8>;
+defm V_CMPX_LT_U16 : VOPC_Real_vi <0xb9>;
+defm V_CMPX_EQ_U16 : VOPC_Real_vi <0xba>;
+defm V_CMPX_LE_U16 : VOPC_Real_vi <0xbb>;
+defm V_CMPX_GT_U16 : VOPC_Real_vi <0xbc>;
+defm V_CMPX_NE_U16 : VOPC_Real_vi <0xbd>;
+defm V_CMPX_GE_U16 : VOPC_Real_vi <0xbe>;
+defm V_CMPX_T_U16 : VOPC_Real_vi <0xbf>;
+
+defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>;
+defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>;
+defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>;
+defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>;
+defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>;
+defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>;
+defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>;
+defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>;
+
+defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>;
+defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>;
+defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>;
+defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>;
+defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>;
+defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>;
+defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>;
+defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>;
+
+defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>;
+defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>;
+defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>;
+defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>;
+defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>;
+defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>;
+defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>;
+defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>;
+
+defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>;
+defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>;
+defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>;
+defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>;
+defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>;
+defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>;
+defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>;
+defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>;
+
+defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>;
+defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>;
+defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>;
+defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>;
+defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>;
+defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>;
+defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>;
+defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>;
+
+defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>;
+defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>;
+defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>;
+defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>;
+defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>;
+defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>;
+defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>;
+defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>;
+
+defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>;
+defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>;
+defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>;
+defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>;
+defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>;
+defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>;
+defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>;
+defm V_CMP_T_U64 : VOPC_Real_vi <0xef>;
+
+defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>;
+defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>;
+defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>;
+defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>;
+defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>;
+defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>;
+defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>;
+defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
new file mode 100644
index 0000000..5f72f97
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -0,0 +1,350 @@
+//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Dummy field declarations so an outer 'let' can set these fields even on
+// classes that do not otherwise define them.
+class LetDummies {
+ bit isCommutable;
+ bit isConvertibleToThreeAddress;
+ bit isMoveImm;
+ bit isReMaterializable;
+ bit isAsCheapAsAMove;
+ bit VOPAsmPrefer32Bit;
+ Predicate SubtargetPredicate;
+ string Constraints;
+ string DisableEncoding;
+ list<SchedReadWrite> SchedRW;
+ list<Register> Uses;
+ list<Register> Defs;
+}
+
+class VOP <string opName> {
+ string OpName = opName;
+}
+
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+}
+
+class VOP3Common <dag outs, dag ins, string asm = "",
+ list<dag> pattern = [], bit HasMods = 0,
+ bit VOP3Only = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+
+ // Using complex patterns gives VOP3 patterns a very high complexity rating,
+ // but standalone patterns are almost always preferred, so the priority is
+ // adjusted down. The goal is to subtract a value large enough to bring the
+ // complexity to zero (or below).
+ let AddedComplexity = -1000;
+
+ let VOP3 = 1;
+
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
+
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+
+ let isCodeGenOnly = 0;
+
+ int Size = 8;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
+}
+
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
+ InstSI <P.Outs64, P.Ins64, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
+ MnemonicAlias<opName#"_e64", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.Asm64;
+
+ let Size = 8;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SubtargetPredicate = isGCN;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
+
+ // Using complex patterns gives VOP3 patterns a very high complexity rating,
+ // but standalone patterns are almost always preferred, so the priority is
+ // adjusted down. The goal is to subtract a value large enough to bring the
+ // complexity to zero (or below).
+ let AddedComplexity = -1000;
+
+ let VOP3 = 1;
+ let VALU = 1;
+ let Uses = [EXEC];
+
+ let AsmVariantName = AMDGPUAsmVariants.VOP3;
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+
+ VOPProfile Pfl = P;
+}
+
+class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP3a<VOPProfile P> : Enc64 {
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
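+// The SI/CI and VI forms below differ only in field placement: SI/CI put the
+// 9-bit opcode in Inst{25-17} and clamp in Inst{11}, while VI puts a 10-bit
+// opcode in Inst{25-16} and clamp in Inst{15}.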
+class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+ let Inst{25-17} = op;
+ let Inst{11} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
+ let Inst{25-16} = op;
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
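+// VOP3b encoding: has both a vector destination (vdst) and a scalar
+// destination (sdst); used by instructions that also write an SGPR result
+// (e.g. a carry-out).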
+class VOP3be <VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<7> sdst;
+ bits<2> omod;
+
+ let Inst{7-0} = vdst;
+ let Inst{14-8} = sdst;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+ let Inst{25-17} = op;
+}
+
+class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
+ bits<1> clamp;
+ let Inst{25-16} = op;
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+def SDWA {
+ // sdwa_sel
+ int BYTE_0 = 0;
+ int BYTE_1 = 1;
+ int BYTE_2 = 2;
+ int BYTE_3 = 3;
+ int WORD_0 = 4;
+ int WORD_1 = 5;
+ int DWORD = 6;
+
+ // dst_unused
+ int UNUSED_PAD = 0;
+ int UNUSED_SEXT = 1;
+ int UNUSED_PRESERVE = 2;
+}
+
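+// Raw field encodings for the SDWA sub-dword selects (byte/word/dword) and
+// the destination-unused modes referenced by VOP_SDWAe below.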
+class VOP_SDWAe<VOPProfile P> : Enc64 {
+ bits<8> src0;
+ bits<3> src0_sel;
+ bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+ bits<3> src1_sel;
+ bits<2> src1_modifiers;
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+ let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+ let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+}
+
+class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+ InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
+ MnemonicAlias <opName#"_sdwa", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.AsmSDWA;
+
+ let Size = 8;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+
+ let VALU = 1;
+ let SDWA = 1;
+ let Uses = [EXEC];
+
+ let SubtargetPredicate = isVI;
+ let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA";
+
+ VOPProfile Pfl = P;
+}
+
+class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // Copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AssemblerPredicate = ps.AssemblerPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let DecoderNamespace = ps.DecoderNamespace;
+ let TSFlags = ps.TSFlags;
+}
+
+class VOP_DPPe<VOPProfile P> : Enc64 {
+ bits<2> src0_modifiers;
+ bits<8> src0;
+ bits<2> src1_modifiers;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{48-40} = dpp_ctrl;
+ let Inst{51} = bound_ctrl;
+ let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
+ let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
+ let Inst{54} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg
+ let Inst{55} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs
+ let Inst{59-56} = bank_mask;
+ let Inst{63-60} = row_mask;
+}
+
+class VOP_DPP <string OpName, VOPProfile P> :
+ InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
+ VOP_DPPe<P> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let SubtargetPredicate = isVI;
+ let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "DPP";
+}
+
+include "VOPCInstructions.td"
+include "VOP1Instructions.td"
+include "VOP2Instructions.td"
+include "VOP3Instructions.td"