Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
129 files changed, 23805 insertions, 13078 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index d4784b5..7b0a7f4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -11,22 +11,18 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H -#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" namespace llvm { -class AMDGPUInstrPrinter; -class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; class GCNTargetMachine; -struct MachineSchedContext; -class MCAsmInfo; -class raw_ostream; -class ScheduleDAGInstrs; +class ModulePass; +class Pass; class Target; class TargetMachine; +class PassRegistry; // R600 Passes FunctionPass *createR600VectorRegMerger(TargetMachine &tm); @@ -45,16 +41,12 @@ FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSIWholeQuadModePass(); -FunctionPass *createSILowerControlFlowPass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); -FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); -ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); - ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; @@ -78,21 +70,30 @@ void initializeSIWholeQuadModePass(PassRegistry &); extern char &SIWholeQuadModeID; void initializeSILowerControlFlowPass(PassRegistry &); -extern char &SILowerControlFlowPassID; +extern char &SILowerControlFlowID; + +void initializeSIInsertSkipsPass(PassRegistry &); +extern char &SIInsertSkipsPassID; +void initializeSIOptimizeExecMaskingPass(PassRegistry &); +extern char &SIOptimizeExecMaskingID; // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; -FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +FunctionPass *createAMDGPUISelDag(TargetMachine &TM, + CodeGenOpt::Level OptLevel); ModulePass *createAMDGPUAlwaysInlinePass(); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); +FunctionPass* createAMDGPUUnifyMetadataPass(); +void initializeAMDGPUUnifyMetadataPass(PassRegistry&); +extern char &AMDGPUUnifyMetadataID; + void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; @@ -111,8 +112,8 @@ extern char &SIDebuggerInsertNopsID; void initializeSIInsertWaitsPass(PassRegistry&); extern char &SIInsertWaitsID; -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; +Target &getTheAMDGPUTarget(); +Target &getTheGCNTarget(); namespace AMDGPU { enum TargetIndex { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index 72c4553..1302200 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -67,6 +67,19 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "Support unaligned global 
loads and stores" >; +def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", + "UnalignedScratchAccess", + "true", + "Support unaligned scratch loads and stores" +>; + +// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support +// XNACK. The current default kernel driver setting is: +// - graphics ring: XNACK disabled +// - compute ring: XNACK enabled +// +// If XNACK is enabled, the VMEM latency can be worse. +// If XNACK is disabled, the 2 SGPRs can be used for general purposes. def FeatureXNACK : SubtargetFeature<"xnack", "EnableXNACK", "true", @@ -110,20 +123,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping> - : SubtargetFeature < - "isaver"#Major#"."#Minor#"."#Stepping, - "IsaVersion", - "ISAVersion"#Major#"_"#Minor#"_"#Stepping, - "Instruction set version number" ->; - -def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; -def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>; -def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>; -def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; -def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; - class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", @@ -161,16 +160,46 @@ def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "Has s_memrealtime instruction" >; +def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm", + "HasInv2PiInlineImm", + "true", + "Has 1 / (2 * pi) as inline immediate" +>; + def Feature16BitInsts : SubtargetFeature<"16-bit-insts", "Has16BitInsts", "true", "Has i16/f16 instructions" >; +def FeatureMovrel : SubtargetFeature<"movrel", + "HasMovrel", + "true", + "Has v_movrel*_b32 instructions" +>; + +def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode", + "HasVGPRIndexMode", + "true", + "Has VGPR mode register indexing" +>; + +def FeatureScalarStores : SubtargetFeature<"scalar-stores", + "HasScalarStores", + "true", + "Has store scalar memory instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP16Denormals", + "true", + "Enable half precision denormal handling" +>; + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. @@ -253,6 +282,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "Enable SI Machine Scheduler" >; +// Unless +-flat-for-global is specified, turn on FlatForGlobal for +// all OS-es on VI and newer hardware to avoid assertion failures due +// to missing ADDR64 variants of MUBUF instructions. +// FIXME: moveToVALU should be able to handle converting addr64 MUBUF +// instructions. 
+ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "FlatForGlobal", "true", @@ -294,23 +329,76 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, - FeatureLDSBankCount32] + FeatureLDSBankCount32, FeatureMovrel] >; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts] + FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel] >; def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, - FeatureSMemRealTime + FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, + FeatureScalarStores, FeatureInv2PiInlineImm ] >; +class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, + list<SubtargetFeature> Implies> + : SubtargetFeature < + "isaver"#Major#"."#Minor#"."#Stepping, + "IsaVersion", + "ISAVersion"#Major#"_"#Minor#"_"#Stepping, + "Instruction set version number", + Implies +>; + +def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, + [FeatureSeaIslands, + FeatureLDSBankCount32]>; + +def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, + [FeatureSeaIslands, + HalfRate64Ops, + FeatureLDSBankCount32, + FeatureFastFMAF32]>; + +def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, + [FeatureSeaIslands, + FeatureLDSBankCount16]>; + +def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, + [FeatureVolcanicIslands, + FeatureLDSBankCount32, + FeatureSGPRInitBug]>; + +def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, + [FeatureVolcanicIslands, + FeatureLDSBankCount32, + FeatureXNACK]>; + +def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, + [FeatureVolcanicIslands, + FeatureLDSBankCount32, + FeatureSGPRInitBug]>; + +def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, + [FeatureVolcanicIslands, + FeatureLDSBankCount32]>; + +def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4, + [FeatureVolcanicIslands, + FeatureLDSBankCount32]>; + +def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, + [FeatureVolcanicIslands, + FeatureLDSBankCount16, + FeatureXNACK]>; + //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
//===----------------------------------------------------------------------===// @@ -349,10 +437,52 @@ def AMDGPUAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; } +def AMDGPUAsmWriter : AsmWriter { + int PassSubtarget = 1; +} + +def AMDGPUAsmVariants { + string Default = "Default"; + int Default_ID = 0; + string VOP3 = "VOP3"; + int VOP3_ID = 1; + string SDWA = "SDWA"; + int SDWA_ID = 2; + string DPP = "DPP"; + int DPP_ID = 3; + string Disable = "Disable"; + int Disable_ID = 4; +} + +def DefaultAMDGPUAsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.Default_ID; + let Name = AMDGPUAsmVariants.Default; +} + +def VOP3AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.VOP3_ID; + let Name = AMDGPUAsmVariants.VOP3; +} + +def SDWAAsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA_ID; + let Name = AMDGPUAsmVariants.SDWA; +} + +def DPPAsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.DPP_ID; + let Name = AMDGPUAsmVariants.DPP; +} + def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; let AssemblyParsers = [AMDGPUAsmParser]; + let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, + VOP3AsmParserVariant, + SDWAAsmParserVariant, + DPPAsmParserVariant]; + let AssemblyWriters = [AMDGPUAsmWriter]; } // Dummy Instruction itineraries for pseudo instructions @@ -381,6 +511,8 @@ def isCIVI : Predicate < def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 63f5fb3..067a16a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -27,7 +27,7 @@ class AMDGPUAlwaysInline : public ModulePass { public: AMDGPUAlwaysInline() : ModulePass(ID) { } bool runOnModule(Module &M) override; - const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } + StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } }; } // End anonymous namespace @@ -35,8 +35,20 @@ public: char AMDGPUAlwaysInline::ID = 0; bool AMDGPUAlwaysInline::runOnModule(Module &M) { + std::vector<GlobalAlias*> AliasesToRemove; std::vector<Function *> FuncsToClone; + for (GlobalAlias &A : M.aliases()) { + if (Function* F = dyn_cast<Function>(A.getAliasee())) { + A.replaceAllUsesWith(F); + AliasesToRemove.push_back(&A); + } + } + + for (GlobalAlias* A : AliasesToRemove) { + A->eraseFromParent(); + } + for (Function &F : M) { if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && !F.hasFnAttribute(Attribute::NoInline)) diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 0910b28..c98d25e2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -35,7 +36,7 @@ public: AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } bool runOnModule(Module &M) 
override; - const char *getPassName() const override { + StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } @@ -188,7 +189,8 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { static const StringRef HSAIntrinsicToAttr[][2] = { { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" } + { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, + { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" } }; // TODO: We should not add the attributes if the known compile time workgroup @@ -200,7 +202,7 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { // always initialized. bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA) { + if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); for (Function &F : M) { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 2010cc9..c011be6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -15,7 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/Debug.h" @@ -30,6 +33,10 @@ namespace { class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor<AMDGPUAnnotateUniformValues> { DivergenceAnalysis *DA; + MemoryDependenceResults *MDR; + LoopInfo *LI; + DenseMap<Value*, GetElementPtrInst*> noClobberClones; + bool isKernelFunc; public: static char ID; @@ -37,15 +44,19 @@ public: FunctionPass(ID) { } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + StringRef getPassName() const override { + return "AMDGPU Annotate Uniform Values"; + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<MemoryDependenceWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesAll(); } void visitBranchInst(BranchInst &I); void visitLoadInst(LoadInst &I); - + bool isClobberedInFunction(LoadInst * Load); }; } // End anonymous namespace @@ -53,6 +64,8 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) @@ -61,6 +74,46 @@ char AMDGPUAnnotateUniformValues::ID = 0; static void setUniformMetadata(Instruction *I) { I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); } +static void setNoClobberMetadata(Instruction *I) { + I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); +} + +static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) { + for (auto I : predecessors(Root)) + if (Set.insert(I)) + DFS(I, Set); +} + +bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { + // 1. get Loop for the Load->getparent(); + // 2. 
if it exists, collect all the BBs from the most outer + // loop and check for the writes. If NOT - start DFS over all preds. + // 3. Start DFS over all preds from the most outer loop header. + SetVector<BasicBlock *> Checklist; + BasicBlock *Start = Load->getParent(); + Checklist.insert(Start); + const Value *Ptr = Load->getPointerOperand(); + const Loop *L = LI->getLoopFor(Start); + if (L) { + const Loop *P = L; + do { + L = P; + P = P->getParentLoop(); + } while (P); + Checklist.insert(L->block_begin(), L->block_end()); + Start = L->getHeader(); + } + + DFS(Start, Checklist); + for (auto &BB : Checklist) { + BasicBlock::iterator StartIt = (BB == Load->getParent()) ? + BasicBlock::iterator(Load) : BB->end(); + if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), + true, StartIt, BB, Load).isClobber()) + return true; + } + return false; +} void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { if (I.isUnconditional()) @@ -77,10 +130,39 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; - - if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) + auto isGlobalLoad = [](LoadInst &Load)->bool { + return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + }; + // We're tracking up to the Function boundaries + // We cannot go beyond because of FunctionPass restrictions + // Thus we can ensure that memory not clobbered for memory + // operations that live in kernel only. + bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I); + Instruction *PtrI = dyn_cast<Instruction>(Ptr); + if (!PtrI && NotClobbered && isGlobalLoad(I)) { + if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) { + // Lookup for the existing GEP + if (noClobberClones.count(Ptr)) { + PtrI = noClobberClones[Ptr]; + } else { + // Create GEP of the Value + Function *F = I.getParent()->getParent(); + Value *Idx = Constant::getIntegerValue( + Type::getInt32Ty(Ptr->getContext()), APInt(64, 0)); + // Insert GEP at the entry to make it dominate all uses + PtrI = GetElementPtrInst::Create( + Ptr->getType()->getPointerElementType(), Ptr, + ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI()); + } + I.replaceUsesOfWith(Ptr, PtrI); + } + } + + if (PtrI) { setUniformMetadata(PtrI); - + if (NotClobbered) + setNoClobberMetadata(PtrI); + } } bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { @@ -91,9 +173,13 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { if (skipFunction(F)) return false; - DA = &getAnalysis<DivergenceAnalysis>(); - visit(F); + DA = &getAnalysis<DivergenceAnalysis>(); + MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; + visit(F); + noClobberClones.clear(); return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c9c95c7..974e79f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -39,9 +39,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "AMDGPURuntimeMetadata.h" -using namespace ::AMDGPU; using namespace llvm; // TODO: This should get the default rounding mode from the kernel. 
We just set @@ -87,13 +85,19 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm, } extern "C" void LLVMInitializeAMDGPUAsmPrinter() { - TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); - TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), + createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), + createAMDGPUAsmPrinterPass); } AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} + : AsmPrinter(TM, std::move(Streamer)) {} + +StringRef AMDGPUAsmPrinter::getPassName() const { + return "AMDGPU Assembly Printer"; +} void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() != Triple::AMDHSA) @@ -113,13 +117,30 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); - emitStartOfRuntimeMetadata(M); + + // Emit runtime metadata. + TS->EmitRuntimeMetadata(M); } +bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( + const MachineBasicBlock *MBB) const { + if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) + return false; + + if (MBB->empty()) + return true; + + // If this is a block implementing a long branch, an expression relative to + // the start of the block is needed. to the start of the block. + // XXX - Is there a smarter way to check this? + return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); +} + + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; - if (STM.isAmdHsaOS()) { + if (STM.isAmdCodeObjectV2(*MF)) { getSIProgramInfo(KernelInfo, *MF); EmitAmdKernelCodeT(*MF, KernelInfo); } @@ -128,11 +149,12 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - if (MFI->isKernel() && STM.isAmdHsaOS()) { + if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) { AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(), - ELF::STT_AMDGPU_HSA_KERNEL); + SmallString<128> SymbolName; + getNameWithPrefix(SymbolName, MF->getFunction()), + TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } AsmPrinter::EmitFunctionEntryLabel(); @@ -154,12 +176,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + if (!STM.isAmdHsaOS()) { + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + } - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); @@ -198,6 +222,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" LDSByteSize: " + 
Twine(KernelInfo.LDSSize) + " bytes/workgroup (compile time only)", false); + OutStreamer->emitRawComment(" SGPRBlocks: " + + Twine(KernelInfo.SGPRBlocks), false); + OutStreamer->emitRawComment(" VGPRBlocks: " + + Twine(KernelInfo.VGPRBlocks), false); + + OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " + + Twine(KernelInfo.NumSGPRsForWavesPerEU), false); + OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " + + Twine(KernelInfo.NumVGPRsForWavesPerEU), false); + OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), false); OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), @@ -229,7 +263,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); } } @@ -247,8 +281,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } } - emitRuntimeMetadata(*MF.getFunction()); - return false; } @@ -282,7 +314,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { // Evergreen / Northern Islands switch (MF.getFunction()->getCallingConv()) { - default: // Fall through + default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; @@ -291,9 +323,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } else { // R600 / R700 switch (MF.getFunction()->getCallingConv()) { - default: // Fall through - case CallingConv::AMDGPU_GS: // Fall through - case CallingConv::AMDGPU_CS: // Fall through + default: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; } @@ -301,13 +333,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->StackSize), 4); + S_STACK_SIZE(MFI->CFStackSize), 4); OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4); + OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); } } @@ -331,7 +363,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - CodeSize += TII->getInstSizeInBytes(MI); + if (isVerbose()) + CodeSize += TII->getInstSizeInBytes(MI); unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { @@ -360,7 +393,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, case AMDGPU::FLAT_SCR: case AMDGPU::FLAT_SCR_LO: case AMDGPU::FLAT_SCR_HI: - FlatUsed = true; + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. 
+ if (MFI->hasFlatScratchInit()) + FlatUsed = true; continue; case AMDGPU::TBA: @@ -369,26 +405,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, case AMDGPU::TMA: case AMDGPU::TMA_LO: case AMDGPU::TMA_HI: - llvm_unreachable("Trap Handler registers should not be used"); - continue; + llvm_unreachable("trap handler registers should not be used"); default: break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { - if (AMDGPU::TTMP_32RegClass.contains(reg)) { - llvm_unreachable("Trap Handler registers should not be used"); - } + assert(!AMDGPU::TTMP_32RegClass.contains(reg) && + "trap handler registers should not be used"); isSGPR = true; width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - if (AMDGPU::TTMP_64RegClass.contains(reg)) { - llvm_unreachable("Trap Handler registers should not be used"); - } + assert(!AMDGPU::TTMP_64RegClass.contains(reg) && + "trap handler registers should not be used"); isSGPR = true; width = 2; } else if (AMDGPU::VReg_64RegClass.contains(reg)) { @@ -445,20 +478,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ExtraSGPRs = 6; } - MaxSGPR += ExtraSGPRs; - // Record first reserved register and reserved register count fields, and // update max register counts if "amdgpu-debugger-reserve-regs" attribute was - // specified. - if (STM.debuggerReserveRegs()) { - ProgInfo.ReservedVGPRFirst = MaxVGPR + 1; - ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount(); - MaxVGPR += MFI->getDebuggerReservedVGPRCount(); - } + // requested. + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was specified. + // attribute was requested. if (STM.debuggerEmitPrologue()) { ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); @@ -466,21 +494,59 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, RI->getHWRegIndex(MFI->getScratchRSrcReg()); } + // Check the addressable register limit before we add ExtraSGPRs. + if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + !STM.hasSGPRInitBug()) { + unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs(); + if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { + // This can happen due to a compiler bug or when using inline asm. + LLVMContext &Ctx = MF.getFunction()->getContext(); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + "addressable scalar registers", + MaxSGPR + 1, DS_Error, + DK_ResourceLimit, MaxAddressableNumSGPRs); + Ctx.diagnose(Diag); + MaxSGPR = MaxAddressableNumSGPRs - 1; + } + } + + // Account for extra SGPRs and VGPRs reserved for debugger use. + MaxSGPR += ExtraSGPRs; + MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM); + // We found the maximum register index. They start at 0, so add one to get the // number of registers. ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; - if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { + // Adjust number of registers used to meet default/requested minimum/maximum + // number of waves per execution unit request. 
+ ProgInfo.NumSGPRsForWavesPerEU = std::max( + ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU())); + ProgInfo.NumVGPRsForWavesPerEU = std::max( + ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU())); + + if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || + STM.hasSGPRInitBug()) { + unsigned MaxNumSGPRs = STM.getMaxNumSGPRs(); + if (ProgInfo.NumSGPR > MaxNumSGPRs) { + // This can happen due to a compiler bug or when using inline asm to use the + // registers which are usually reserved for vcc etc. + LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), - "SGPRs with SGPR init bug", - ProgInfo.NumSGPR, DS_Error); + "scalar registers", + ProgInfo.NumSGPR, DS_Error, + DK_ResourceLimit, MaxNumSGPRs); Ctx.diagnose(Diag); + ProgInfo.NumSGPR = MaxNumSGPRs; + ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs; } + } + if (STM.hasSGPRInitBug()) { ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { @@ -490,26 +556,34 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, Ctx.diagnose(Diag); } - if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) { + if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) { LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", - MFI->LDSSize, DS_Error); + MFI->getLDSSize(), DS_Error); Ctx.diagnose(Diag); } - ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; - ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; + // SGPRBlocks is actual number of SGPR blocks minus 1. + ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, + RI->getSGPRAllocGranule()); + ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1; + + // VGPRBlocks is actual number of VGPR blocks minus 1. + ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, + RI->getVGPRAllocGranule()); + ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1; + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); - ProgInfo.IEEEMode = 0; + ProgInfo.IEEEMode = STM.enableIEEEBit(MF); // Make clamp modifier on NaN input returns 0. 
ProgInfo.DX10Clamp = 1; - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo.getStackSize(); ProgInfo.FlatUsed = FlatUsed; ProgInfo.VCCUsed = VCCUsed; @@ -524,10 +598,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, LDSAlignShift = 9; } - unsigned LDSSpillSize = MFI->LDSWaveSpillSize * - MFI->getMaximumWorkGroupSize(MF); + unsigned LDSSpillSize = + MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize(); - ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; @@ -573,7 +647,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { - default: // Fall through + default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; @@ -703,7 +777,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, if (STM.isXNACKEnabled()) header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - header.kernarg_segment_byte_size = MFI->ABIArgOffset; + // FIXME: Should use getKernArgSize + header.kernarg_segment_byte_size = + STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; @@ -711,6 +787,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + // These alignment values are specified in powers of two, so alignment = + // 2^n. The minimum alignment is 2^4 = 16. + header.kernarg_segment_alignment = std::max((size_t)4, + countTrailingZeros(MFI->getMaxKernArgAlign())); + if (STM.debuggerEmitPrologue()) { header.debug_wavefront_private_segment_offset_sgpr = KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; @@ -745,231 +826,3 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); return false; } - -// Emit a key and an integer value for runtime metadata. -static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer, - RuntimeMD::Key K, uint64_t V, - unsigned Size) { - Streamer->EmitIntValue(K, 1); - Streamer->EmitIntValue(V, Size); -} - -// Emit a key and a string value for runtime metadata. -static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer, - RuntimeMD::Key K, StringRef S) { - Streamer->EmitIntValue(K, 1); - Streamer->EmitIntValue(S.size(), 4); - Streamer->EmitBytes(S); -} - -// Emit a key and three integer values for runtime metadata. 
-// The three integer values are obtained from MDNode \p Node; -static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer, - RuntimeMD::Key K, MDNode *Node, - unsigned Size) { - Streamer->EmitIntValue(K, 1); - Streamer->EmitIntValue(mdconst::extract<ConstantInt>( - Node->getOperand(0))->getZExtValue(), Size); - Streamer->EmitIntValue(mdconst::extract<ConstantInt>( - Node->getOperand(1))->getZExtValue(), Size); - Streamer->EmitIntValue(mdconst::extract<ConstantInt>( - Node->getOperand(2))->getZExtValue(), Size); -} - -void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { - OutStreamer->SwitchSection(getObjFileLowering().getContext() - .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); - - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, - RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); - if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - if (MD->getNumOperands()) { - auto Node = MD->getOperand(0); - if (Node->getNumOperands() > 1) { - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, - RuntimeMD::OpenCL_C, 1); - uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) - ->getZExtValue(); - uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) - ->getZExtValue(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, - Major * 100 + Minor * 10, 2); - } - } - } -} - -static std::string getOCLTypeName(Type *Ty, bool isSigned) { - if (VectorType* VecTy = dyn_cast<VectorType>(Ty)) { - Type* EleTy = VecTy->getElementType(); - unsigned Size = VecTy->getVectorNumElements(); - return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str(); - } - switch (Ty->getTypeID()) { - case Type::HalfTyID: return "half"; - case Type::FloatTyID: return "float"; - case Type::DoubleTyID: return "double"; - case Type::IntegerTyID: { - if (!isSigned) - return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str(); - auto IntTy = cast<IntegerType>(Ty); - auto BW = IntTy->getIntegerBitWidth(); - switch (BW) { - case 8: - return "char"; - case 16: - return "short"; - case 32: - return "int"; - case 64: - return "long"; - default: - return (Twine('i') + Twine(BW)).str(); - } - } - default: - llvm_unreachable("invalid type"); - } -} - -static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType( - Type *Ty, StringRef TypeName) { - if (auto VT = dyn_cast<VectorType>(Ty)) - return getRuntimeMDValueType(VT->getElementType(), TypeName); - else if (auto PT = dyn_cast<PointerType>(Ty)) - return getRuntimeMDValueType(PT->getElementType(), TypeName); - else if (Ty->isHalfTy()) - return RuntimeMD::KernelArg::F16; - else if (Ty->isFloatTy()) - return RuntimeMD::KernelArg::F32; - else if (Ty->isDoubleTy()) - return RuntimeMD::KernelArg::F64; - else if (IntegerType* intTy = dyn_cast<IntegerType>(Ty)) { - bool Signed = !TypeName.startswith("u"); - switch (intTy->getIntegerBitWidth()) { - case 8: - return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8; - case 16: - return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16; - case 32: - return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32; - case 64: - return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64; - default: - // Runtime does not recognize other integer types. Report as - // struct type. 
- return RuntimeMD::KernelArg::Struct; - } - } else - return RuntimeMD::KernelArg::Struct; -} - -void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) { - if (!F.getMetadata("kernel_arg_type")) - return; - - MCContext &Context = getObjFileLowering().getContext(); - OutStreamer->SwitchSection( - Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); - OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1); - emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName()); - - for (auto &Arg:F.args()) { - // Emit KeyArgBegin. - unsigned I = Arg.getArgNo(); - OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1); - - // Emit KeyArgSize and KeyArgAlign. - auto T = Arg.getType(); - auto DL = F.getParent()->getDataLayout(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize, - DL.getTypeAllocSize(T), 4); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign, - DL.getABITypeAlignment(T), 4); - - // Emit KeyArgTypeName. - auto TypeName = dyn_cast<MDString>(F.getMetadata( - "kernel_arg_type")->getOperand(I))->getString(); - emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName); - - // Emit KeyArgName. - if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) { - auto ArgName = cast<MDString>(ArgNameMD->getOperand( - I))->getString(); - emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName); - } - - // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe. - auto TypeQual = cast<MDString>(F.getMetadata( - "kernel_arg_type_qual")->getOperand(I))->getString(); - SmallVector<StringRef, 1> SplitQ; - TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/); - for (auto &I:SplitQ) { - auto Key = StringSwitch<RuntimeMD::Key>(I) - .Case("volatile", RuntimeMD::KeyArgIsVolatile) - .Case("restrict", RuntimeMD::KeyArgIsRestrict) - .Case("const", RuntimeMD::KeyArgIsConst) - .Case("pipe", RuntimeMD::KeyArgIsPipe) - .Default(RuntimeMD::KeyNull); - OutStreamer->EmitIntValue(Key, 1); - } - - // Emit KeyArgTypeKind. - auto BaseTypeName = cast<MDString>( - F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString(); - auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName) - .Case("sampler_t", RuntimeMD::KernelArg::Sampler) - .Case("queue_t", RuntimeMD::KernelArg::Queue) - .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", - "image2d_t" , "image2d_array_t", RuntimeMD::KernelArg::Image) - .Cases("image2d_depth_t", "image2d_array_depth_t", - "image2d_msaa_t", "image2d_array_msaa_t", - "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image) - .Cases("image2d_array_msaa_depth_t", "image3d_t", - RuntimeMD::KernelArg::Image) - .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer : - RuntimeMD::KernelArg::Value); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1); - - // Emit KeyArgValueType. - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType, - getRuntimeMDValueType(T, BaseTypeName), 2); - - // Emit KeyArgAccQual. - auto AccQual = cast<MDString>(F.getMetadata( - "kernel_arg_access_qual")->getOperand(I))->getString(); - auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual) - .Case("read_only", RuntimeMD::KernelArg::ReadOnly) - .Case("write_only", RuntimeMD::KernelArg::WriteOnly) - .Case("read_write", RuntimeMD::KernelArg::ReadWrite) - .Default(RuntimeMD::KernelArg::None); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual, - AQ, 1); - - // Emit KeyArgAddrQual. 
- if (isa<PointerType>(T)) - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual, - T->getPointerAddressSpace(), 1); - - // Emit KeyArgEnd - OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1); - } - - // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint. - if (auto RWGS = F.getMetadata("reqd_work_group_size")) - emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize, - RWGS, 4); - if (auto WGSH = F.getMetadata("work_group_size_hint")) - emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint, - WGSH, 4); - if (auto VTH = F.getMetadata("vec_type_hint")) { - auto TypeName = getOCLTypeName(cast<ValueAsMetadata>( - VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>( - VTH->getOperand(1))->getZExtValue()); - emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint, - TypeName); - } - - // Emit KeyKernelEnd - OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1); -} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 7b04c53..9a4bafe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,10 +15,13 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#include "AMDGPUMCInstLower.h" + #include "llvm/CodeGen/AsmPrinter.h" #include <vector> namespace llvm { +class MCOperand; class AMDGPUAsmPrinter final : public AsmPrinter { private: @@ -40,6 +43,8 @@ private: NumVGPR(0), NumSGPR(0), FlatUsed(false), + NumSGPRsForWavesPerEU(0), + NumVGPRsForWavesPerEU(0), ReservedVGPRFirst(0), ReservedVGPRCount(0), DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), @@ -71,15 +76,23 @@ private: uint32_t LDSSize; bool FlatUsed; + // Number of SGPRs that meets number of waves per execution unit request. + uint32_t NumSGPRsForWavesPerEU; + + // Number of VGPRs that meets number of waves per execution unit request. + uint32_t NumVGPRsForWavesPerEU; + // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first // fixed VGPR number reserved. uint16_t ReservedVGPRFirst; + // The number of consecutive VGPRs reserved. uint16_t ReservedVGPRCount; // Fixed SGPR number used to hold wave scratch offset for entire kernel // execution, or uint16_t(-1) if the register is not used or not known. uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire // kernel execution, or uint16_t(-1) if the register is not used or not // known. @@ -108,9 +121,16 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "AMDGPU Assembly Printer"; - } + StringRef getPassName() const override; + + /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated + /// pseudo lowering. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + + /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo + /// instructions. 
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); /// Implemented in AMDGPUMCInstLower.cpp void EmitInstruction(const MachineInstr *MI) override; @@ -123,14 +143,13 @@ public: void EmitStartOfAsmFile(Module &M) override; + bool isBlockOnlyReachableByFallthrough( + const MachineBasicBlock *MBB) const override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; - void emitStartOfRuntimeMetadata(const Module &M); - - void emitRuntimeMetadata(const Function &F); - protected: std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 1a1da8a..d53cc15 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1,4 +1,4 @@ -//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===// +//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===// // // The LLVM Compiler Infrastructure // @@ -34,9 +34,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return true; } -bool AMDGPUCallLowering::lowerFormalArguments( - MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, - const SmallVectorImpl<unsigned> &VRegs) const { +bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<unsigned> VRegs) const { // TODO: Implement once there are generic loads/stores. return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 61174ba..9ae87c9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -27,10 +27,8 @@ class AMDGPUCallLowering: public CallLowering { bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, unsigned VReg) const override; - bool - lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function::ArgumentListType &Args, - const SmallVectorImpl<unsigned> &VRegs) const override; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<unsigned> VRegs) const override; }; } // End of namespace llvm; #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b955e23..e623054 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -39,6 +39,78 @@ class AMDGPUCodeGenPrepare : public FunctionPass, Module *Mod; bool HasUnsafeFPMath; + /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to + /// binary operation \p V. + /// + /// \returns Binary operation \p V. + Value *copyFlags(const BinaryOperator &I, Value *V) const; + + /// \returns \p T's base element bit width. + unsigned getBaseElementBitWidth(const Type *T) const; + + /// \returns Equivalent 32 bit integer type for given type \p T. For example, + /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> + /// is returned. + Type *getI32Ty(IRBuilder<> &B, const Type *T) const; + + /// \returns True if binary operation \p I is a signed binary operation, false + /// otherwise. 
+ bool isSigned(const BinaryOperator &I) const; + + /// \returns True if the condition of 'select' operation \p I comes from a + /// signed 'icmp' operation, false otherwise. + bool isSigned(const SelectInst &I) const; + + /// \returns True if type \p T needs to be promoted to 32 bit integer type, + /// false otherwise. + bool needsPromotionToI32(const Type *T) const; + + /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary + /// operation. + /// + /// \details \p I's base element bit width must be greater than 1 and less + /// than or equal 16. Promotion is done by sign or zero extending operands to + /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and + /// truncating the result of 32 bit binary operation back to \p I's original + /// type. Division operation is not promoted. + /// + /// \returns True if \p I is promoted to equivalent 32 bit binary operation, + /// false otherwise. + bool promoteUniformOpToI32(BinaryOperator &I) const; + + /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. + /// + /// \details \p I's base element bit width must be greater than 1 and less + /// than or equal 16. Promotion is done by sign or zero extending operands to + /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. + /// + /// \returns True. + bool promoteUniformOpToI32(ICmpInst &I) const; + + /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select' + /// operation. + /// + /// \details \p I's base element bit width must be greater than 1 and less + /// than or equal 16. Promotion is done by sign or zero extending operands to + /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the + /// result of 32 bit 'select' operation back to \p I's original type. + /// + /// \returns True. + bool promoteUniformOpToI32(SelectInst &I) const; + + /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' + /// intrinsic. + /// + /// \details \p I's base element bit width must be greater than 1 and less + /// than or equal 16. Promotion is done by zero extending the operand to 32 + /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the + /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the + /// shift amount is 32 minus \p I's base element bit width), and truncating + /// the result of the shift operation back to \p I's original type. + /// + /// \returns True. 
+ bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; + public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : @@ -51,16 +123,18 @@ public: bool visitFDiv(BinaryOperator &I); - bool visitInstruction(Instruction &I) { - return false; - } + bool visitInstruction(Instruction &I) { return false; } + bool visitBinaryOperator(BinaryOperator &I); + bool visitICmpInst(ICmpInst &I); + bool visitSelectInst(SelectInst &I); + + bool visitIntrinsicInst(IntrinsicInst &I); + bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "AMDGPU IR optimizations"; - } + StringRef getPassName() const override { return "AMDGPU IR optimizations"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DivergenceAnalysis>(); @@ -70,6 +144,171 @@ public: } // End anonymous namespace +Value *AMDGPUCodeGenPrepare::copyFlags( + const BinaryOperator &I, Value *V) const { + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V); + if (!BinOp) // Possibly constant expression. + return V; + + if (isa<OverflowingBinaryOperator>(BinOp)) { + BinOp->setHasNoSignedWrap(I.hasNoSignedWrap()); + BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + } else if (isa<PossiblyExactOperator>(BinOp)) + BinOp->setIsExact(I.isExact()); + + return V; +} + +unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { + assert(needsPromotionToI32(T) && "T does not need promotion to i32"); + + if (T->isIntegerTy()) + return T->getIntegerBitWidth(); + return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); +} + +Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { + assert(needsPromotionToI32(T) && "T does not need promotion to i32"); + + if (T->isIntegerTy()) + return B.getInt32Ty(); + return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); +} + +bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { + return I.getOpcode() == Instruction::AShr || + I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; +} + +bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { + return isa<ICmpInst>(I.getOperand(0)) ? 
+ cast<ICmpInst>(I.getOperand(0))->isSigned() : false; +} + +bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { + if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 && + T->getIntegerBitWidth() <= 16) + return true; + if (!T->isVectorTy()) + return false; + return needsPromotionToI32(cast<VectorType>(T)->getElementType()); +} + +bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { + assert(needsPromotionToI32(I.getType()) && + "I does not need promotion to i32"); + + if (I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::UDiv) + return false; + + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = getI32Ty(Builder, I.getType()); + Value *ExtOp0 = nullptr; + Value *ExtOp1 = nullptr; + Value *ExtRes = nullptr; + Value *TruncRes = nullptr; + + if (isSigned(I)) { + ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); + ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); + } else { + ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); + ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); + } + ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1)); + TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); + + I.replaceAllUsesWith(TruncRes); + I.eraseFromParent(); + + return true; +} + +bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const { + assert(needsPromotionToI32(I.getOperand(0)->getType()) && + "I does not need promotion to i32"); + + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); + Value *ExtOp0 = nullptr; + Value *ExtOp1 = nullptr; + Value *NewICmp = nullptr; + + if (I.isSigned()) { + ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); + ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); + } else { + ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); + ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); + } + NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); + + I.replaceAllUsesWith(NewICmp); + I.eraseFromParent(); + + return true; +} + +bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const { + assert(needsPromotionToI32(I.getType()) && + "I does not need promotion to i32"); + + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = getI32Ty(Builder, I.getType()); + Value *ExtOp1 = nullptr; + Value *ExtOp2 = nullptr; + Value *ExtRes = nullptr; + Value *TruncRes = nullptr; + + if (isSigned(I)) { + ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); + ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); + } else { + ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); + ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); + } + ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); + TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); + + I.replaceAllUsesWith(TruncRes); + I.eraseFromParent(); + + return true; +} + +bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( + IntrinsicInst &I) const { + assert(I.getIntrinsicID() == Intrinsic::bitreverse && + "I must be bitreverse intrinsic"); + assert(needsPromotionToI32(I.getType()) && + "I does not need promotion to i32"); + + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = getI32Ty(Builder, I.getType()); + Function *I32 = + Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); + Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); + Value *ExtRes = 
Builder.CreateCall(I32, { ExtOp }); + Value *LShrOp = + Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); + Value *TruncRes = + Builder.CreateTrunc(LShrOp, I.getType()); + + I.replaceAllUsesWith(TruncRes); + I.eraseFromParent(); + + return true; +} + static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); if (!CNum) @@ -85,7 +324,6 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Type *Ty = FDiv.getType(); - // TODO: Handle half if (!Ty->getScalarType()->isFloatTy()) return false; @@ -154,6 +392,55 @@ static bool hasUnsafeFPMath(const Function &F) { return Attr.getValueAsString() == "true"; } +bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { + bool Changed = false; + + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && + DA->isUniform(&I)) + Changed |= promoteUniformOpToI32(I); + + return Changed; +} + +bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { + bool Changed = false; + + if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && + DA->isUniform(&I)) + Changed |= promoteUniformOpToI32(I); + + return Changed; +} + +bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) { + bool Changed = false; + + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && + DA->isUniform(&I)) + Changed |= promoteUniformOpToI32(I); + + return Changed; +} + +bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::bitreverse: + return visitBitreverseIntrinsicInst(I); + default: + return false; + } +} + +bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) { + bool Changed = false; + + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && + DA->isUniform(&I)) + Changed |= promoteUniformBitreverseToI32(I); + + return Changed; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { Mod = &M; return false; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index bbc28b8..805fb71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -10,23 +10,22 @@ // Interface to describe a layout of a stack frame on a AMDGPU target machine. // //===----------------------------------------------------------------------===// + #include "AMDGPUFrameLowering.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" - +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Instructions.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl) : TargetFrameLowering(D, StackAl, LAO, TransAl) { } -AMDGPUFrameLowering::~AMDGPUFrameLowering() { } +AMDGPUFrameLowering::~AMDGPUFrameLowering() = default; unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { - // XXX: Hardcoding to 1 for now. 
// // I think the StackWidth should stored as metadata associated with the @@ -75,7 +74,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AMDGPURegisterInfo *RI = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo(); @@ -86,19 +85,18 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, // XXX: We should only do this when the shader actually uses this // information. unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + int UpperBound = FI == -1 ? MFI.getNumObjects() : FI; - for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i)); - OffsetBytes += MFI->getObjectSize(i); + for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); + OffsetBytes += MFI.getObjectSize(i); // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. OffsetBytes = alignTo(OffsetBytes, 4); } if (FI != -1) - OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI)); + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); return OffsetBytes / (getStackWidth(MF) * 4); } - diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 513848a..5d51351 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -11,6 +11,7 @@ /// \brief Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// + #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H @@ -27,7 +28,7 @@ class AMDGPUFrameLowering : public TargetFrameLowering { public: AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1); - virtual ~AMDGPUFrameLowering(); + ~AMDGPUFrameLowering() override; /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. 
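The MachineFrameInfo-by-reference change above keeps the same offset walk as before in getFrameIndexReference: each frame object is aligned, its size is added, and the running total is padded to 4 bytes so no two objects share a register, before dividing by the stack width in bytes. A minimal standalone model of that walk is sketched below, assuming a stack width of 1; FrameObject and frameIndexToRegUnits are hypothetical names for illustration, not part of the patch.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

struct FrameObject {
  uint64_t Size;
  uint64_t Align;
};

static uint64_t alignUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Offset of object FI in 4-byte register units, mirroring the loop in
// getFrameIndexReference above (illustrative only).
static uint64_t frameIndexToRegUnits(const std::vector<FrameObject> &Objects,
                                     unsigned FI, unsigned StackWidth = 1) {
  uint64_t OffsetBytes = 2 * (StackWidth * 4); // the reserved starting offset
  for (unsigned I = 0; I < FI; ++I) {
    OffsetBytes = alignUp(OffsetBytes, Objects[I].Align);
    OffsetBytes += Objects[I].Size;
    // Each register holds 4 bytes, so pad so two objects never share one.
    OffsetBytes = alignUp(OffsetBytes, 4);
  }
  OffsetBytes = alignUp(OffsetBytes, Objects[FI].Align);
  return OffsetBytes / (StackWidth * 4);
}

int main() {
  // A 1-byte object followed by an 8-byte, 8-aligned object.
  std::vector<FrameObject> Objects = {{1, 1}, {8, 8}};
  assert(frameIndexToRegUnits(Objects, 0) == 2); // right after the reserved area
  assert(frameIndexToRegUnits(Objects, 1) == 4); // 8 + 1, padded to 16 bytes
  return 0;
}
```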
@@ -40,5 +41,7 @@ public: return false; } }; -} // namespace llvm -#endif + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 23c9352..5bf347e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -12,25 +12,48 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPURegisterInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instruction.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> +#include <cstdint> +#include <new> +#include <vector> using namespace llvm; namespace llvm { + class R600InstrInfo; -} + +} // end namespace llvm //===----------------------------------------------------------------------===// // Instruction Selector Implementation @@ -38,18 +61,6 @@ class R600InstrInfo; namespace { -static bool isCBranchSCC(const SDNode *N) { - assert(N->getOpcode() == ISD::BRCOND); - if (!N->hasOneUse()) - return false; - - SDValue Cond = N->getOperand(1); - if (Cond.getOpcode() == ISD::CopyToReg) - Cond = Cond.getOperand(2); - return Cond.getOpcode() == ISD::SETCC && - Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse(); -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -58,16 +69,18 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { const AMDGPUSubtarget *Subtarget; public: - AMDGPUDAGToDAGISel(TargetMachine &TM); - virtual ~AMDGPUDAGToDAGISel(); + explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(TM, OptLevel) {} + ~AMDGPUDAGToDAGISel() override = default; + bool runOnMachineFunction(MachineFunction &MF) override; void Select(SDNode *N) override; - const char *getPassName() const override; - void PreprocessISelDAG() override; + StringRef getPassName() const override; void PostprocessISelDAG() override; private: - bool isInlineImmediate(SDNode *N) const; + SDValue foldFrameIndex(SDValue N) const; + bool isInlineImmediate(const SDNode *N) const; bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); @@ -145,40 +158,46 @@ private: void SelectADD_SUB_I64(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectFMA_W_CHAIN(SDNode *N); + void SelectFMUL_W_CHAIN(SDNode *N); SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width); void SelectS_BFEFromShifts(SDNode *N); void SelectS_BFE(SDNode *N); + bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" }; + } // end anonymous namespace /// \brief This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { - return new AMDGPUDAGToDAGISel(TM); +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new AMDGPUDAGToDAGISel(TM, OptLevel); } -AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) - : SelectionDAGISel(TM) {} - bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } -AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { -} +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { + const SIInstrInfo *TII + = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo(); + + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); -bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { - const SITargetLowering *TL - = static_cast<const SITargetLowering *>(getTargetLowering()); - return TL->analyzeImmediate(N) == 0; + return false; } /// \brief Determine the register class for \p OpNo @@ -187,8 +206,21 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { /// determined. 
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const { - if (!N->isMachineOpcode()) + if (!N->isMachineOpcode()) { + if (N->getOpcode() == ISD::CopyToReg) { + unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); + return MRI.getRegClass(Reg); + } + + const SIRegisterInfo *TRI + = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + return TRI->getPhysRegClass(Reg); + } + return nullptr; + } switch (N->getMachineOpcode()) { default: { @@ -244,7 +276,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: - return AMDGPU::SReg_32RegClassID; + return AMDGPU::SReg_32_XM0RegClassID; case 2: return AMDGPU::SReg_64RegClassID; case 4: @@ -275,7 +307,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. case ISD::ADD: - case ISD::SUB: { + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUB: + case ISD::SUBC: + case ISD::SUBE: { if (N->getValueType(0) != MVT::i64 || Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; @@ -283,6 +319,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } + case AMDGPUISD::FMUL_W_CHAIN: { + SelectFMUL_W_CHAIN(N); + return; + } + case AMDGPUISD::FMA_W_CHAIN: { + SelectFMA_W_CHAIN(N); + return; + } + case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { @@ -498,7 +543,7 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { Term->getMetadata("structurizecfg.uniform"); } -const char *AMDGPUDAGToDAGISel::getPassName() const { +StringRef AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -563,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, if ((C = dyn_cast<ConstantSDNode>(Addr))) { Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { Base = Addr.getOperand(0); @@ -580,7 +629,12 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - bool IsAdd = (N->getOpcode() == ISD::ADD); + unsigned Opcode = N->getOpcode(); + bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE); + bool ProduceCarry = + ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC; + bool IsAdd = + (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE); SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); @@ -596,25 +650,70 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { DL, MVT::i32, RHS, Sub1); SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; unsigned Opc = IsAdd ? 
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); - SDValue Carry(AddLo, 1); - SDNode *AddHi - = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, - SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + SDNode *AddLo; + if (!ConsumeCarry) { + SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args); + } else { + SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) }; + AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args); + } + SDValue AddHiArgs[] = { + SDValue(Hi0, 0), + SDValue(Hi1, 0), + SDValue(AddLo, 1) + }; + SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs); - SDValue Args[5] = { + SDValue RegSequenceArgs[] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), SDValue(AddLo,0), Sub0, SDValue(AddHi,0), Sub1, }; - CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs); + + if (ProduceCarry) { + // Replace the carry-use + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); + } + + // Replace the remaining uses. + CurDAG->ReplaceAllUsesWith(N, RegSequence); + CurDAG->RemoveDeadNode(N); +} + +void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { + SDLoc SL(N); + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[10]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]); + Ops[8] = N->getOperand(0); + Ops[9] = N->getOperand(4); + + CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); +} + +void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { + SDLoc SL(N); + // src0_modifiers, src0, src1_modifiers, src1, clamp, omod + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + Ops[6] = N->getOperand(0); + Ops[7] = N->getOperand(3); + + CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops); } // We need to handle this here because tablegen doesn't support matching @@ -628,14 +727,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, - // omod - SDValue Ops[8]; - - SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); - SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); - SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -779,6 +872,9 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, } // default case + + // FIXME: This is broken on SI where we still need to check if the base + // pointer is positive here. 
Base = Addr; Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); @@ -825,7 +921,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N2; VAddr = N3; } else { - // (add N0, C1) -> offset VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); Ptr = N0; @@ -903,6 +998,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); } +SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { + if (auto FI = dyn_cast<FrameIndexSDNode>(N)) + return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); + return N; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { @@ -922,14 +1023,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, // Offsets in vaddr must be positive. ConstantSDNode *C1 = cast<ConstantSDNode>(N1); if (isLegalMUBUFImmOffset(C1)) { - VAddr = N0; + VAddr = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } // (node) - VAddr = Addr; + VAddr = foldFrameIndex(Addr); ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } @@ -1122,7 +1223,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { - SDLoc SL(Addr); if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); @@ -1327,36 +1427,53 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { SelectCode(N); } +bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { + assert(N->getOpcode() == ISD::BRCOND); + if (!N->hasOneUse()) + return false; + + SDValue Cond = N->getOperand(1); + if (Cond.getOpcode() == ISD::CopyToReg) + Cond = Cond.getOperand(2); + + if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse()) + return false; + + MVT VT = Cond.getOperand(0).getSimpleValueType(); + if (VT == MVT::i32) + return true; + + if (VT == MVT::i64) { + auto ST = static_cast<const SISubtarget *>(Subtarget); + + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64(); + } + + return false; +} + void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { SDValue Cond = N->getOperand(1); + if (Cond.isUndef()) { + CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, + N->getOperand(2), N->getOperand(0)); + return; + } + if (isCBranchSCC(N)) { // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. SelectCode(N); return; } - // The result of VOPC instructions is or'd against ~EXEC before it is - // written to vcc or another SGPR. This means that the value '1' is always - // written to the corresponding bit for results that are masked. In order - // to correctly check against vccz, we need to and VCC with the EXEC - // register in order to clear the value from the masked bits. - SDLoc SL(N); - SDNode *MaskedCond = - CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, - CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), - Cond); - SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, - SDValue(MaskedCond, 0), - SDValue()); // Passing SDValue() adds a - // glue output. 
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond); CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, N->getOperand(2), // Basic Block - VCC.getValue(0), // Chain - VCC.getValue(1)); // Glue - return; + VCC.getValue(0)); } // This is here because there isn't a way to use the generated sub0_sub1 as the @@ -1427,7 +1544,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { - unsigned Mods = 0; Src = In; @@ -1491,62 +1607,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } -void AMDGPUDAGToDAGISel::PreprocessISelDAG() { - MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo(); - - // Handle the perverse case where a frame index is being stored. We don't - // want to see multiple frame index operands on the same instruction since - // it complicates things and violates some assumptions about frame index - // lowering. - for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd(); - I != E; ++I) { - SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32); - - // It's possible that we have a frame index defined in the function that - // isn't used in this block. - if (FI.use_empty()) - continue; - - // Skip over the AssertZext inserted during lowering. - SDValue EffectiveFI = FI; - auto It = FI->use_begin(); - if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) { - EffectiveFI = SDValue(*It, 0); - It = EffectiveFI->use_begin(); - } - - for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) { - SDUse &Use = It.getUse(); - SDNode *User = Use.getUser(); - unsigned OpIdx = It.getOperandNo(); - ++It; - - if (MemSDNode *M = dyn_cast<MemSDNode>(User)) { - unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 
2 : 1; - if (OpIdx == PtrIdx) - continue; - - unsigned OpN = M->getNumOperands(); - SDValue NewOps[8]; - - assert(OpN < array_lengthof(NewOps)); - for (unsigned Op = 0; Op != OpN; ++Op) { - if (Op != OpIdx) { - NewOps[Op] = M->getOperand(Op); - continue; - } - - MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SDLoc(M), MVT::i32, FI); - NewOps[Op] = SDValue(Mov, 0); - } - - CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN)); - } - } - } -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 352423ed..54caa2c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -37,7 +37,7 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, MachineFunction &MF = State.getMachineFunction(); AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), + uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), ArgFlags.getOrigAlign()); State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; @@ -55,14 +55,6 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, StoreSize); - - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -180,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); @@ -287,6 +269,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -367,6 +350,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::OR, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); @@ -440,22 +425,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v4f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); + // 
There are no libcalls of any kind. + for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) + setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. For now, + // we don't have a way of knowing during instruction selection if a condition + // will be uniform and we always use vector compares. Assume we are using + // vector compares until that is fixed. + setHasMultipleConditionRegisters(true); + // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. They are also optional in OpenCL // (Section 7.3) setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); - setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - setFsqrtIsCheap(true); - // We want to find all load dependencies for long chains of stores to enable // merging into very wide vectors. The problem is with vectors with > 4 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 @@ -472,22 +466,42 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MaxStoresPerMemset = 4096; setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::MULHU); + setTargetDAGCombine(ISD::MULHS); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +static bool fnegFoldsIntoOp(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FMA: + case ISD::FMAD: + case ISD::FSIN: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMUL_LEGACY: + return true; + default: + return false; + } +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -500,7 +514,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // FIXME: Why are we reporting vectors of FP immediates as legal? bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || + (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); } // We don't want to shrink f64 / f32 constants. 
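The fnegFoldsIntoOp helper added above, alongside setTargetDAGCombine(ISD::FNEG), lists the operations whose negated result can instead be produced by negating an operand, so the fneg becomes free. The sketch below only spot-checks the underlying floating-point identities in plain C++ (with exact division standing in for the hardware RCP approximation); it is an illustration of why the folds are sound, not the DAG combine itself.

```cpp
#include <cassert>
#include <cmath>

int main() {
  double A = 1.5, B = -2.25, C = 4.0;

  // fneg(fadd(a, b)) == fadd(fneg(a), fneg(b))
  assert(-(A + B) == (-A) + (-B));
  // fneg(fsub(a, b)) == fsub(b, a)
  assert(-(A - B) == (B - A));
  // fneg(fmul(a, b)): negating a single operand is enough.
  assert(-(A * B) == (-A) * B);
  // fneg(fma(a, b, c)) == fma(fneg(a), b, fneg(c))
  assert(-std::fma(A, B, C) == std::fma(-A, B, -C));
  // fneg(rcp(a)): exact division stands in for the RCP approximation here.
  assert(-(1.0 / A) == 1.0 / (-A));
  return 0;
}
```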
@@ -565,12 +580,12 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && + VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return isFAbsFree(VT); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -593,19 +608,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. - return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); + + unsigned SrcSize = Source.getSizeInBits(); + unsigned DestSize = Dest.getSizeInBits(); + + return DestSize < SrcSize && DestSize % 32 == 0 ; } bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); + + unsigned SrcSize = Source->getScalarSizeInBits(); + unsigned DestSize = Dest->getScalarSizeInBits(); + + if (DestSize== 16 && Subtarget->has16BitInsts()) + return SrcSize >= 32; + + return DestSize < SrcSize && DestSize % 32 == 0; } bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { unsigned SrcSize = Src->getScalarSizeInBits(); unsigned DestSize = Dest->getScalarSizeInBits(); + if (SrcSize == 16 && Subtarget->has16BitInsts()) + return DestSize >= 32; + return SrcSize == 32 && DestSize == 64; } @@ -614,6 +642,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { // practical purposes, the extra mov 0 to load a 64-bit is free. As used, // this will enable reducing 64-bit operations the 32-bit, which is always // good. + + if (Src == MVT::i16) + return Dest == MVT::i32 ||Dest == MVT::i64 ; + return Src == MVT::i32 && Dest == MVT::i64; } @@ -635,9 +667,105 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // TargetLowering Callbacks //===---------------------------------------------------------------------===// -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, +/// The SelectionDAGBuilder will automatically promote function arguments +/// with illegal types. However, this does not work for the AMDGPU targets +/// since the function arguments are stored in memory as these illegal types. +/// In order to handle this properly we need to get the original types sizes +/// from the LLVM IR Function and fixup the ISD:InputArg values before +/// passing them to AnalyzeFormalArguments() + +/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting +/// input values across multiple registers. Each item in the Ins array +/// represents a single value that will be stored in regsters. Ins[x].VT is +/// the value type of the value that will be stored in the register, so +/// whatever SDNode we lower the argument to needs to be this type. +/// +/// In order to correctly lower the arguments we need to know the size of each +/// argument. Since Ins[x].VT gives us the size of the register that will +/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type +/// for the orignal function argument so that we can deduce the correct memory +/// type to use for Ins[x]. 
In most cases the correct memory type will be +/// Ins[x].ArgVT. However, this will not always be the case. If, for example, +/// we have a kernel argument of type v8i8, this argument will be split into +/// 8 parts and each part will be represented by its own item in the Ins array. +/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of +/// the argument before it was split. From this, we deduce that the memory type +/// for each individual part is i8. We pass the memory type as LocVT to the +/// calling convention analysis function and the register type (Ins[x].VT) as +/// the ValVT. +void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const { + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + const ISD::InputArg &In = Ins[i]; + EVT MemVT; + + unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); + + if (!Subtarget->isAmdHsaOS() && + (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) { + // The ABI says the caller will extend these values to 32-bits. + MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32; + } else if (NumRegs == 1) { + // This argument is not split, so the IR type is the memory type. + assert(!In.Flags.isSplit()); + if (In.ArgVT.isExtended()) { + // We have an extended type, like i24, so we should just use the register type + MemVT = In.VT; + } else { + MemVT = In.ArgVT; + } + } else if (In.ArgVT.isVector() && In.VT.isVector() && + In.ArgVT.getScalarType() == In.VT.getScalarType()) { + assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); + // We have a vector value which has been split into a vector with + // the same scalar type, but fewer elements. This should handle + // all the floating-point vector types. + MemVT = In.VT; + } else if (In.ArgVT.isVector() && + In.ArgVT.getVectorNumElements() == NumRegs) { + // This arg has been split so that each element is stored in a separate + // register. + MemVT = In.ArgVT.getScalarType(); + } else if (In.ArgVT.isExtended()) { + // We have an extended type, like i65. + MemVT = In.VT; + } else { + unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; + assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); + if (In.VT.isInteger()) { + MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); + } else if (In.VT.isVector()) { + assert(!In.VT.getScalarType().isFloatingPoint()); + unsigned NumElements = In.VT.getVectorNumElements(); + assert(MemoryBits % NumElements == 0); + // This vector type has been split into another vector type with + // a different elements size. + EVT ScalarVT = EVT::getIntegerVT(State.getContext(), + MemoryBits / NumElements); + MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); + } else { + llvm_unreachable("cannot deduce memory type."); + } + } + // Convert one element vectors to scalar. 
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) + MemVT = MemVT.getScalarType(); + + if (MemVT.isExtended()) { + // This should really only happen if we have vec3 arguments + assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + MemVT = MemVT.getPow2VectorType(State.getContext()); + } + + assert(MemVT.isSimple()); + allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, + State); + } +} + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } @@ -678,8 +806,10 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); - for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) - InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } return DAG.getEntryNode(); } @@ -718,6 +848,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::CTLZ: @@ -745,94 +876,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } -// FIXME: This implements accesses to initialized globals in the constant -// address space by copying them to private and accessing that. It does not -// properly handle illegal types or vectors. The private vector loads are not -// scalarized, and the illegal scalars hit an assertion. This technique will not -// work well with large initializers, and this should eventually be -// removed. Initialized globals should be placed into a data section that the -// runtime will load into a buffer before the kernel is executed. Uses of the -// global need to be replaced with a pointer loaded from an implicit kernel -// argument into this buffer holding the copy of the data, which will remove the -// need for any of this. 
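The analyzeFormalArgumentsCompute comment above explains how the memory type of each split kernel argument is deduced from Ins[x].ArgVT and the number of registers it occupies. The sketch below only reproduces the store-size arithmetic for two cases in the spirit of that comment; SimpleVT and memoryBitsPerPart are made-up names for illustration, not LLVM types.

```cpp
#include <cassert>

// A deliberately simplified value type; not an LLVM EVT.
struct SimpleVT {
  unsigned ScalarBits;
  unsigned NumElements; // 1 for scalars
  unsigned storeSizeInBits() const { return ScalarBits * NumElements; }
};

// Memory width of each part when an argument of type ArgVT is split across
// NumRegs registers, following the size bookkeeping described above.
static unsigned memoryBitsPerPart(SimpleVT ArgVT, unsigned NumRegs) {
  assert(ArgVT.storeSizeInBits() % NumRegs == 0);
  return ArgVT.storeSizeInBits() / NumRegs;
}

int main() {
  // The v8i8 example from the comment: 8 parts, each stored as an i8.
  SimpleVT V8I8 = {8, 8};
  assert(memoryBitsPerPart(V8I8, 8) == 8);

  // If a v4i16 argument is split into 4 register-sized parts, each part
  // still stores only 16 bits in memory.
  SimpleVT V4I16 = {16, 4};
  assert(memoryBitsPerPart(V4I16, 4) == 16);
  return 0;
}
```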
-SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, - const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const { - const DataLayout &TD = DAG.getDataLayout(); - SDLoc DL(InitPtr); - Type *InitTy = Init->getType(); - - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(InitTy)); - } - - if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { - EVT VT = EVT::getEVT(CFP->getType()); - PointerType *PtrTy = PointerType::get(CFP->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(CFP->getType())); - } - - if (StructType *ST = dyn_cast<StructType>(InitTy)) { - const StructLayout *SL = TD.getStructLayout(ST); - - EVT PtrVT = InitPtr.getValueType(); - SmallVector<SDValue, 8> Chains; - - for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { - SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(I); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { - EVT PtrVT = InitPtr.getValueType(); - - unsigned NumElements; - if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) - NumElements = AT->getNumElements(); - else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) - NumElements = VT->getNumElements(); - else - llvm_unreachable("Unexpected type"); - - unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType()); - SmallVector<SDValue, 8> Chains; - for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(i); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (isa<UndefValue>(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), - TD.getPrefTypeAlignment(InitTy)); - } - - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); -} - static bool hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar || !GVar->hasInitializer()) @@ -850,11 +893,6 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { - case AMDGPUAS::CONSTANT_ADDRESS: { - MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT); - return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA); - } case AMDGPUAS::LOCAL_ADDRESS: { // XXX: What does the value of G->getOffset() mean? 
assert(G->getOffset() == 0 && @@ -864,24 +902,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (hasDefinedInitializer(GV)) break; - unsigned Offset; - if (MFI->LocalMemoryObjects.count(GV) == 0) { - unsigned Align = GV->getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV->getValueType()); - - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. - Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align); - MFI->LocalMemoryObjects[GV] = Offset; - MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType()); - } else { - Offset = MFI->LocalMemoryObjects[GV]; - } - - return DAG.getConstant(Offset, SDLoc(Op), - getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS)); + unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); + return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); } } @@ -1097,65 +1119,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return DAG.getMergeValues(Ops, SL); } -// FIXME: This isn't doing anything for SI. This should be used in a target -// combine during type legalization. -SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast<StoreSDNode>(Op); - EVT MemVT = Store->getMemoryVT(); - unsigned MemBits = MemVT.getSizeInBits(); - - // Byte stores are really expensive, so if possible, try to pack 32-bit vector - // truncating store into an i32 store. - // XXX: We could also handle optimize other vector bitwidths. - if (!MemVT.isVector() || MemBits > 32) { - return SDValue(); - } - - SDLoc DL(Op); - SDValue Value = Store->getValue(); - EVT VT = Value.getValueType(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Ptr = Store->getBasePtr(); - EVT MemEltVT = MemVT.getVectorElementType(); - unsigned MemEltBits = MemEltVT.getSizeInBits(); - unsigned MemNumElements = MemVT.getVectorNumElements(); - unsigned PackedSize = MemVT.getStoreSizeInBits(); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); - - assert(Value.getValueType().getScalarSizeInBits() >= 32); - - SDValue PackedValue; - for (unsigned i = 0; i < MemNumElements; ++i) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, - DAG.getConstant(i, DL, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); - Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg - - SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); - Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); - - if (i == 0) { - PackedValue = Elt; - } else { - PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); - } - } - - if (PackedSize < 32) { - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); - return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), PackedVT, - Store->getAlignment(), - Store->getMemOperand()->getFlags()); - } - - return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - Store->getAlignment(), - Store->getMemOperand()->getFlags()); -} - SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1670,7 +1633,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f64); - APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + APFloat C1Val(APFloat::IEEEdouble(), 
"0x1.0p+52"); SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); @@ -1681,7 +1644,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); - APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); EVT SetCCVT = @@ -1988,14 +1951,26 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerSINT_TO_FP. + EVT DestVT = Op.getValueType(); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, false); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, false); - return SDValue(); + assert(DestVT == MVT::f64); + return LowerINT_TO_FP64(Op, DAG, false); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -2003,14 +1978,26 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerUINT_TO_FP. + EVT DestVT = Op.getValueType(); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } + if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, true); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, true); - - return SDValue(); + assert(DestVT == MVT::f64); + return LowerINT_TO_FP64(Op, DAG, true); } SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, @@ -2042,10 +2029,118 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); } +SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { + + if (getTargetMachine().Options.UnsafeFPMath) { + // There is a generic expand for FP_TO_FP16 with unsafe fast math. + return SDValue(); + } + + SDLoc DL(Op); + SDValue N0 = Op.getOperand(0); + assert (N0.getSimpleValueType() == MVT::f64); + + // f64 -> f16 conversion using round-to-nearest-even rounding mode. 
+ const unsigned ExpMask = 0x7ff; + const unsigned ExpBiasf64 = 1023; + const unsigned ExpBiasf16 = 15; + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); + SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, + DAG.getConstant(32, DL, MVT::i64)); + UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); + U = DAG.getZExtOrTrunc(U, DL, MVT::i32); + SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(20, DL, MVT::i64)); + E = DAG.getNode(ISD::AND, DL, MVT::i32, E, + DAG.getConstant(ExpMask, DL, MVT::i32)); + // Subtract the fp64 exponent bias (1023) to get the real exponent and + // add the f16 bias (15) to get the biased exponent for the f16 format. + E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, + DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); + + SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(8, DL, MVT::i32)); + M = DAG.getNode(ISD::AND, DL, MVT::i32, M, + DAG.getConstant(0xffe, DL, MVT::i32)); + + SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, + DAG.getConstant(0x1ff, DL, MVT::i32)); + MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); + + SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); + M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); + + // (M != 0 ? 0x0200 : 0) | 0x7c00; + SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, + DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), + Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); + + // N = M | (E << 12); + SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, + DAG.getNode(ISD::SHL, DL, MVT::i32, E, + DAG.getConstant(12, DL, MVT::i32))); + + // B = clamp(1-E, 0, 13); + SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, + One, E); + SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); + B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, + DAG.getConstant(13, DL, MVT::i32)); + + SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, + DAG.getConstant(0x1000, DL, MVT::i32)); + + SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); + SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); + SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); + D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); + + SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); + SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, + DAG.getConstant(0x7, DL, MVT::i32)); + V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, + DAG.getConstant(2, DL, MVT::i32)); + SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), + One, Zero, ISD::SETEQ); + SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), + One, Zero, ISD::SETGT); + V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); + V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); + + V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), + DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); + V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), + I, V, ISD::SETEQ); + + // Extract the sign bit. 
+ SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, + DAG.getConstant(16, DL, MVT::i32)); + Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, + DAG.getConstant(0x8000, DL, MVT::i32)); + + V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); + return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); +} + SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_UINT. + + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, true); @@ -2056,6 +2151,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_SINT. + + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, false); @@ -2068,8 +2176,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); - if (!VT.isVector()) - return SDValue(); + assert(VT.isVector()); SDValue Src = Op.getOperand(0); SDLoc DL(Op); @@ -2108,17 +2215,20 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; } -static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { +static bool simplifyI24(SDNode *Node24, unsigned OpIdx, + TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op = Node24->getOperand(OpIdx); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); + if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + return true; + + return false; } template <typename IntTy> @@ -2188,6 +2298,9 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (VT.isVector()) + return scalarizeVectorLoad(LN, DAG); + SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); return DAG.getMergeValues(Ops, SDLoc(N)); @@ -2236,8 +2349,12 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. 
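The LowerFP_TO_FP16 lowering above performs a full f64 to f16 conversion in integer arithmetic, including round-to-nearest-even, denormal, overflow, and NaN handling. The sketch below only illustrates the field extraction and the exponent re-bias from 1023 to 15 for a value that is exactly representable in half precision, where truncating the significand loses nothing; it is not a general conversion and does not reproduce the lowering's rounding or special-case paths.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 1.5 is exactly representable in f16, so truncation of the significand
  // is exact and no rounding or clamping is needed.
  double D = 1.5;
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));

  uint64_t Sign = Bits >> 63;           // 1 sign bit
  uint64_t E64  = (Bits >> 52) & 0x7ff; // 11-bit biased exponent
  uint64_t M10  = (Bits >> 42) & 0x3ff; // top 10 significand bits

  // Re-bias the exponent: subtract the f64 bias (1023), add the f16 bias (15).
  uint64_t E16 = E64 - 1023 + 15;

  uint16_t Half = static_cast<uint16_t>((Sign << 15) | (E16 << 10) | M10);
  assert(Half == 0x3E00); // 1.5 in IEEE half precision
  return 0;
}
```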
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) + if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (VT.isVector()) + return scalarizeVectorStore(SN, DAG); + return expandUnalignedStore(SN, DAG); + } if (!IsFast) return SDValue(); @@ -2262,38 +2379,21 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } -// TODO: Should repeat for other bit ops. -SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (N->getValueType(0) != MVT::i64) - return SDValue(); - - // Break up 64-bit and of a constant into two 32-bit ands. This will typically - // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer - // combine opportunities since most 64-bit operations are decomposed this way. - // TODO: We won't want this for SALU especially if it is an inline immediate. - const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!RHS) - return SDValue(); - - uint64_t Val = RHS->getZExtValue(); - if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) { - // If either half of the constant is 0, this is really a 32-bit and, so - // split it. If we can re-use the full materialized constant, keep it. - return SDValue(); - } - - SDLoc SL(N); +/// Split the 64-bit value \p LHS into two 32-bit components, and perform the +/// binary operation \p Opc to it with the corresponding constant operands. +SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( + DAGCombinerInfo &DCI, const SDLoc &SL, + unsigned Opc, SDValue LHS, + uint32_t ValLo, uint32_t ValHi) const { SelectionDAG &DAG = DCI.DAG; - SDValue Lo, Hi; - std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG); + std::tie(Lo, Hi) = split64BitValue(LHS, DAG); - SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32); - SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); + SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); - SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS); - SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS); + SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); + SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); // Re-visit the ands. It's possible we eliminated one of them and it could // simplify the vector. @@ -2408,11 +2508,40 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } +// We need to specifically handle i64 mul here to avoid unnecessary conversion +// instructions. If we only match on the legalized i64 mul expansion, +// SimplifyDemandedBits will be unable to remove them because there will be +// multiple uses due to the separate mul + mulh[su]. +static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, + SDValue N0, SDValue N1, unsigned Size, bool Signed) { + if (Size <= 32) { + unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); + } + + // Because we want to eliminate extension instructions before the + // operation, we need to create a single user here (i.e. not the separate + // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it. + + unsigned MulOpc = Signed ? 
AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24; + + SDValue Mul = DAG.getNode(MulOpc, SL, + DAG.getVTList(MVT::i32, MVT::i32), N0, N1); + + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, + Mul.getValue(0), Mul.getValue(1)); +} + SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT.isVector() || VT.getSizeInBits() > 32) + unsigned Size = VT.getSizeInBits(); + if (VT.isVector() || Size > 64) + return SDValue(); + + // There are i16 integer mul/mad. + if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -2425,11 +2554,11 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + Mul = getMul24(DAG, DL, N0, N1, Size, false); } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + Mul = getMul24(DAG, DL, N0, N1, Size, true); } else { return SDValue(); } @@ -2439,6 +2568,77 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMulI24() || VT.isVector()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (!isI24(N0, DAG) || !isI24(N1, DAG)) + return SDValue(); + + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + + SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); + DCI.AddToWorklist(Mulhi.getNode()); + return DAG.getSExtOrTrunc(Mulhi, DL, VT); +} + +SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (!isU24(N0, DAG) || !isU24(N1, DAG)) + return SDValue(); + + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + + SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); + DCI.AddToWorklist(Mulhi.getNode()); + return DAG.getZExtOrTrunc(Mulhi, DL, VT); +} + +SDValue AMDGPUTargetLowering::performMulLoHi24Combine( + SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + // Simplify demanded bits before splitting into multiple users. + if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI)) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24); + + unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + unsigned MulHiOpc = Signed ? 
AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; + + SDLoc SL(N); + + SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); + SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); + return DAG.getMergeValues({ MulLo, MulHi }, SL); +} + static bool isNegativeOne(SDValue Val) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) return C->isAllOnesValue(); @@ -2449,23 +2649,21 @@ static bool isCtlzOpc(unsigned Opc) { return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; } -// Get FFBH node if the incoming op may have been type legalized from a smaller -// type VT. -// Need to match pre-legalized type because the generic legalization inserts the -// add/sub between the select and compare. -static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG, - const SDLoc &SL, SDValue Op) { +SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL) const { EVT VT = Op.getValueType(); - EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - if (LegalVT != MVT::i32) + EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && + LegalVT != MVT::i16)) return SDValue(); if (VT != MVT::i32) - Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); return FFBH; } @@ -2493,7 +2691,7 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, isCtlzOpc(RHS.getOpcode()) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(*this, DAG, SL, CmpLHS); + return getFFBH_U32(DAG, CmpLHS, SL); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x @@ -2501,14 +2699,99 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, isCtlzOpc(LHS.getOpcode()) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(*this, DAG, SL, CmpLHS); + return getFFBH_U32(DAG, CmpLHS, SL); + } + + return SDValue(); +} + +static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, + unsigned Op, + const SDLoc &SL, + SDValue Cond, + SDValue N1, + SDValue N2) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N1.getValueType(); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, + N1.getOperand(0), N2.getOperand(0)); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(Op, SL, VT, NewSelect); +} + +// Pull a free FP operation out of a select so it may fold into uses. 
+// +// select c, (fneg x), (fneg y) -> fneg (select c, x, y) +// select c, (fneg x), k -> fneg (select c, x, (fneg k)) +// +// select c, (fabs x), (fabs y) -> fabs (select c, x, y) +// select c, (fabs x), +k -> fabs (select c, x, k) +static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) { + SelectionDAG &DAG = DCI.DAG; + SDValue Cond = N.getOperand(0); + SDValue LHS = N.getOperand(1); + SDValue RHS = N.getOperand(2); + + EVT VT = N.getValueType(); + if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || + (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); + } + + bool Inv = false; + if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { + std::swap(LHS, RHS); + Inv = true; + } + + // TODO: Support vector constants. + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + SDLoc SL(N); + // If one side is an fneg/fabs and the other is a constant, we can push the + // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. + SDValue NewLHS = LHS.getOperand(0); + SDValue NewRHS = RHS; + + // Careful: if the neg can be folded up, don't try to pull it back down. + bool ShouldFoldNeg = true; + + if (NewLHS.hasOneUse()) { + unsigned Opc = NewLHS.getOpcode(); + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + ShouldFoldNeg = false; + if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) + ShouldFoldNeg = false; + } + + if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FNEG) + NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else if (CRHS->isNegative()) + return SDValue(); + + if (Inv) + std::swap(NewLHS, NewRHS); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); + } } return SDValue(); } + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -2521,6 +2804,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); + if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. + SelectionDAG &DAG = DCI.DAG; + if ((DAG.isConstantValueOfAnyType(True) || + DAG.isConstantValueOfAnyType(True)) && + (!DAG.isConstantValueOfAnyType(False) && + !DAG.isConstantValueOfAnyType(False))) { + // Swap cmp + select pair to move constant to false input. + // This will allow using VOPC cndmasks more often. 
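The swap described in the comment above relies on a simple identity: inverting the compare and exchanging the select operands leaves the result unchanged while moving the constant to the false input. A scalar sketch for the integer case (the in-tree code uses getSetCCInverse, which also handles FP ordering; these helpers are illustrative only):

#include <cassert>

// Original form: the constant k sits on the true side of the select.
static int select_const_true(int x, int y, int k) {
  return (x < y) ? k : x;
}

// Rewritten form: inverted predicate, operands swapped, k on the false side.
static int select_const_false(int x, int y, int k) {
  return !(x < y) ? x : k;
}

int main() {
  for (int x = -3; x <= 3; ++x)
    for (int y = -3; y <= 3; ++y)
      assert(select_const_true(x, y, 7) == select_const_false(x, y, 7));
  return 0;
}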
+ // select (setcc x, y), k, x -> select (setcc y, x) x, x + + SDLoc SL(N); + ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + LHS.getValueType().isInteger()); + + SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); + return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); + } + } + if (VT == MVT::f32 && Cond.hasOneUse()) { SDValue MinMax = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); @@ -2533,6 +2835,135 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. + // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + if (!mayIgnoreSignedZero(N0)) + return SDValue(); + + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + if (!mayIgnoreSignedZero(N0)) + return SDValue(); + + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) 
-> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2543,6 +2974,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::BITCAST: { EVT DestVT = N->getValueType(0); + + // Push casts through vector builds. This helps avoid emitting a large + // number of copies when materializing floating point vector constants. + // + // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => + // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) + if (DestVT.isVector()) { + SDValue Src = N->getOperand(0); + if (Src.getOpcode() == ISD::BUILD_VECTOR) { + EVT SrcVT = Src.getValueType(); + unsigned NElts = DestVT.getVectorNumElements(); + + if (SrcVT.getVectorNumElements() == NElts) { + EVT DestEltVT = DestVT.getVectorElementType(); + + SmallVector<SDValue, 8> CastedElts; + SDLoc SL(N); + for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { + SDValue Elt = Src.getOperand(I); + CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); + } + + return DAG.getBuildVector(DestVT, SL, CastedElts); + } + } + } + if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) break; @@ -2591,24 +3049,28 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSraCombine(N, DCI); } - case ISD::AND: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - return performAndCombine(N, DCI); - } case ISD::MUL: return performMulCombine(N, DCI); + case ISD::MULHS: + return performMulhsCombine(N, DCI); + case ISD::MULHU: + return performMulhuCombine(N, DCI); case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - simplifyI24(N0, DCI); - simplifyI24(N1, DCI); + case AMDGPUISD::MUL_U24: + case AMDGPUISD::MULHI_I24: + case AMDGPUISD::MULHI_U24: { + // If the first call to simplify is successfull, then N may end up being + // deleted, so we shouldn't call simplifyI24 again. 
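As background for the performFNegCombine cases above: the fmul rewrite is always sign-exact, while the fadd rewrite can flip the sign of a zero result, which is why it is gated on mayIgnoreSignedZero. A standalone check of both points with plain IEEE-754 floats (not the DAG combine itself):

#include <cassert>
#include <cmath>

static float neg_of_mul(float a, float b) { return -(a * b); }
static float mul_of_neg(float a, float b) { return a * (-b); }

static float neg_of_add(float a, float b) { return -(a + b); }
static float add_of_neg(float a, float b) { return (-a) + (-b); }

int main() {
  // fneg distributes over fmul without any caveats.
  assert(neg_of_mul(3.0f, -2.5f) == mul_of_neg(3.0f, -2.5f));

  // Signed-zero hazard for the fadd rewrite: 0.0 + (-0.0) is +0.0, so the
  // negated sum is -0.0, but (-0.0) + (+0.0) is +0.0.
  assert(std::signbit(neg_of_add(0.0f, -0.0f)));
  assert(!std::signbit(add_of_neg(0.0f, -0.0f)));
  return 0;
}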
+ simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI); return SDValue(); } + case AMDGPUISD::MUL_LOHI_I24: + case AMDGPUISD::MUL_LOHI_U24: + return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2705,38 +3167,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // Helper functions //===----------------------------------------------------------------------===// -void AMDGPUTargetLowering::getOriginalFunctionArgs( - SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl<ISD::InputArg> &Ins, - SmallVectorImpl<ISD::InputArg> &OrigIns) const { - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - if (Ins[i].ArgVT == Ins[i].VT) { - OrigIns.push_back(Ins[i]); - continue; - } - - EVT VT; - if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { - // Vector has been split into scalars. - VT = Ins[i].ArgVT.getVectorElementType(); - } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && - Ins[i].ArgVT.getVectorElementType() != - Ins[i].VT.getVectorElementType()) { - // Vector elements have been promoted - VT = Ins[i].ArgVT; - } else { - // Vector has been spilt into smaller vectors. - VT = Ins[i].VT; - } - - ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, - Ins[i].OrigArgIndex, Ins[i].PartOffset); - OrigIns.push_back(Arg); - } -} - SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2754,7 +3184,8 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { - uint64_t ArgOffset = MFI->ABIArgOffset; + unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); + uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment); switch (Param) { case GRID_DIM: return ArgOffset; @@ -2779,6 +3210,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(FMA_W_CHAIN) + NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(COS_HW) NODE_NAME_CASE(SIN_HW) @@ -2800,7 +3235,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) @@ -2812,12 +3249,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) + NODE_NAME_CASE(FFBH_I32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MULHI_U24) + NODE_NAME_CASE(MULHI_I24) + NODE_NAME_CASE(MUL_LOHI_U24) + NODE_NAME_CASE(MUL_LOHI_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(EXPORT_DONE) + NODE_NAME_CASE(R600_EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) @@ -2833,8 +3277,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + 
NODE_NAME_CASE(KILL) + NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) @@ -2844,16 +3291,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) + NODE_NAME_CASE(BUFFER_LOAD) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; } -SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { - SelectionDAG &DAG = DCI.DAG; +SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { EVT VT = Operand.getValueType(); if (VT == MVT::f32) { @@ -2868,9 +3317,8 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, } SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { - SelectionDAG &DAG = DCI.DAG; + SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const { EVT VT = Operand.getValueType(); if (VT == MVT::f32) { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index c2c7585..f6adcea 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -25,19 +25,19 @@ class AMDGPUSubtarget; class MachineRegisterInfo; class AMDGPUTargetLowering : public TargetLowering { +private: + /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been + /// legalized from a smaller type VT. Need to match pre-legalized type because + /// the generic legalization inserts the add/sub between the select and + /// compare. + SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; + protected: const AMDGPUSubtarget *Subtarget; - SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - /// \brief Lower vector stores by merging the vector elements into an integer - /// of the same bitwidth. - SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. 
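On the reworked getSqrtEstimate/getRecipEstimate hooks above, RefinementSteps counts Newton-Raphson iterations applied to the hardware estimate. A generic host-side illustration of what one such refinement loop does (not the target's rsq path; the starting guess and tolerance are arbitrary):

#include <cassert>
#include <cmath>

// Each step roughly doubles the number of correct bits of an initial
// reciprocal-square-root estimate.
static float refine_rsqrt(float x, float estimate, int refinement_steps) {
  float r = estimate;
  for (int i = 0; i < refinement_steps; ++i)
    r = r * (1.5f - 0.5f * x * r * r);  // Newton step for f(r) = 1/r^2 - x
  return r;
}

int main() {
  const float x = 2.0f;
  float r = refine_rsqrt(x, 0.7f, 2);   // deliberately coarse initial guess
  assert(std::fabs(r - 1.0f / std::sqrt(x)) < 1e-5f);
  return 0;
}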
@@ -60,6 +60,7 @@ protected: SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; @@ -69,17 +70,23 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, + unsigned Opc, SDValue LHS, + uint32_t ValLo, uint32_t ValHi) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentBitType(LLVMContext &Context, EVT VT); virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const; @@ -102,16 +109,8 @@ protected: SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; - /// The SelectionDAGBuilder will automatically promote function arguments - /// with illegal types. However, this does not work for the AMDGPU targets - /// since the function arguments are stored in memory as these illegal types. 
- /// In order to handle this properly we need to get the origianl types sizes - /// from the LLVM IR Function and fixup the ISD:InputArg values before - /// passing them to AnalyzeFormalArguments() - void getOriginalFunctionArgs(SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl<ISD::InputArg> &Ins, - SmallVectorImpl<ISD::InputArg> &OrigIns) const; + void analyzeFormalArgumentsCompute(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; void AnalyzeReturn(CCState &State, @@ -120,6 +119,16 @@ protected: public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); + bool mayIgnoreSignedZero(SDValue Op) const { + if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only + return true; + + if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op)) + return BO->Flags.hasNoSignedZeros(); + + return false; + } + bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; @@ -171,13 +180,14 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; - SDValue getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const override; - SDValue getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const = 0; @@ -228,6 +238,13 @@ enum NodeType : unsigned { DWORDADDR, FRACT, CLAMP, + // This is SETCC with the full mask result which is used for a compare with a + // result bit per item in the wavefront. + SETCC, + SETREG, + // FP ops with input and output chain. + FMA_W_CHAIN, + FMUL_W_CHAIN, // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. @@ -254,7 +271,9 @@ enum NodeType : unsigned { // For f64, max error 2^29 ULP, handles denormals. RCP, RSQ, + RCP_LEGACY, RSQ_LEGACY, + FMUL_LEGACY, RSQ_CLAMP, LDEXP, FP_CLASS, @@ -266,12 +285,19 @@ enum NodeType : unsigned { BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. + FFBH_I32, MUL_U24, MUL_I24, + MULHI_U24, + MULHI_I24, MAD_U24, MAD_I24, + MUL_LOHI_I24, + MUL_LOHI_U24, TEXTURE_FETCH, - EXPORT, + EXPORT, // exp on SI+ + EXPORT_DONE, // exp on SI+ with done bit set + R600_EXPORT, CONST_ADDRESS, REGISTER_LOAD, REGISTER_STORE, @@ -298,10 +324,13 @@ enum NodeType : unsigned { /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, SENDMSG, + SENDMSGHALT, INTERP_MOV, INTERP_P1, INTERP_P2, PC_ADD_REL_OFFSET, + KILL, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, @@ -309,6 +338,8 @@ enum NodeType : unsigned { ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, + BUFFER_LOAD, + BUFFER_LOAD_FORMAT, LAST_AMDGPU_ISD_NUMBER }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 9a00ecb..e4dc659 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -23,7 +23,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#define GET_INSTRINFO_NAMED_OPS #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" @@ -33,10 +32,6 @@ void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) : AMDGPUGenInstrInfo(-1, -1), ST(ST) {} -bool AMDGPUInstrInfo::enableClusterLoads() const { - return true; -} - // FIXME: This behaves strangely. If, for example, you have 32 load + stores, // the first 16 loads will be interleaved with the stores, and the next 16 will // be clustered as expected. It should really split into 2 16 store batches. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index a59eafa..bd8e389 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -17,17 +17,12 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H #include "llvm/Target/TargetInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #define GET_INSTRINFO_HEADER #define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM #include "AMDGPUGenInstrInfo.inc" -#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT -#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT -#define OPCODE_IS_ZERO AMDGPU::PRED_SETE -#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE - namespace llvm { class AMDGPUSubtarget; @@ -44,8 +39,6 @@ private: public: explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - bool enableClusterLoads() const override; - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; @@ -59,15 +52,6 @@ public: /// equivalent opcode that writes \p Channels Channels. int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; }; - -namespace AMDGPU { - LLVM_READONLY - int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); -} // End namespace AMDGPU - } // End llvm namespace -#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) -#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) - #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 2b13bb9..d7fa28b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -40,6 +40,8 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] >; +def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -52,6 +54,9 @@ def AMDGPUconstdata_ptr : SDNode< // This argument to this node is a dword address. 
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// Force dependencies for vector trunc stores +def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>; + def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; @@ -65,6 +70,7 @@ def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) +def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. @@ -82,6 +88,10 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, [] >; +def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints @@ -137,6 +147,24 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; +def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc + SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> +]>; + +def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; + +def AMDGPUSetRegOp : SDTypeProfile<0, 2, [ + SDTCisInt<0>, SDTCisInt<1> +]>; + +def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [ + SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ + SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ + SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; @@ -202,14 +230,22 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; +def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; -// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when -// performing the mulitply. The result is a 32-bit value. +// Signed and unsigned 24-bit multiply. The highest 8-bits are ignore +// when performing the mulitply. The result is a 32-bit value. 
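A host-side reference for the 24-bit multiply semantics described in the comment above (an illustration only, not the TableGen definitions that follow): the operands are reduced to their low 24 bits, sign- or zero-extended from bit 23, and multiplied to a 32-bit result:

#include <cassert>
#include <cstdint>

static uint32_t mul_u24(uint32_t a, uint32_t b) {
  return (a & 0xffffff) * (b & 0xffffff);      // low 32 bits of the product
}

static int32_t mul_i24(uint32_t a, uint32_t b) {
  auto sext24 = [](uint32_t v) {
    return static_cast<int32_t>(v << 8) >> 8;  // sign-extend from bit 23
  };
  // Multiply in 64 bits, then keep the low 32 bits of the product.
  return static_cast<int32_t>(static_cast<int64_t>(sext24(a)) * sext24(b));
}

int main() {
  // The high 8 bits of the inputs do not affect the result.
  assert(mul_u24(0xff000003u, 0x00000005u) == 15u);
  assert(mul_i24(0x00ffffffu, 2) == -2);       // 0xffffff is -1 as a 24-bit int
  return 0;
}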
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, - [SDNPCommutative] + [SDNPCommutative, SDNPAssociative] >; def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, - [SDNPCommutative] + [SDNPCommutative, SDNPAssociative] +>; + +def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; +def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] >; def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, @@ -233,6 +269,10 @@ def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; +def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", SDTypeProfile<1, 3, [SDTCisFP<0>]>, [SDNPInGlue]>; @@ -245,6 +285,35 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", SDTypeProfile<1, 4, [SDTCisFP<0>]>, [SDNPInGlue]>; + +def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, + [SDNPHasChain, SDNPSideEffect]>; + +// SI+ export +def AMDGPUExportOp : SDTypeProfile<0, 8, [ + SDTCisInt<0>, // i8 en + SDTCisInt<1>, // i1 vm + // skip done + SDTCisInt<2>, // i8 tgt + SDTCisSameAs<3, 1>, // i1 compr + SDTCisFP<4>, // f32 src0 + SDTCisSameAs<5, 4>, // f32 src1 + SDTCisSameAs<6, 4>, // f32 src2 + SDTCisSameAs<7, 4> // f32 src3 +]>; + +def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, + [SDNPHasChain, SDNPMayStore]>; + +def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; + + +def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp, + [SDNPHasChain, SDNPSideEffect]>; + //===----------------------------------------------------------------------===// // Flow Control Profile Types //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 3944fdb..59cba63 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,7 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "", field bits<32> Inst = 0xffffffff; } +def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; @@ -49,13 +50,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; -// 32-bit VALU immediate operand that uses the constant bus. 
-def u32kimm : Operand<i32> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_KIMM32"; - let PrintMethod = "printU32ImmOperand"; -} - let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand<i32> { @@ -172,6 +166,12 @@ class HasOneUseBinOp<SDPatternOperator op> : PatFrag< [{ return N->hasOneUse(); }] >; +class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1, node:$src2), + (op $src0, $src1, $src2), + [{ return N->hasOneUse(); }] +>; + //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// @@ -363,53 +363,54 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; -def mskor_flat : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; -}]>; +multiclass global_binary_atomic_op<SDNode atomic_op> { + def "" : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + + def _noret : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + + def _ret : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; +} -class global_binary_atomic_op<SDNode atomic_op> : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] ->; - -class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] ->; - -def atomic_swap_global : global_binary_atomic_op<atomic_swap>; -def atomic_add_global : global_binary_atomic_op<atomic_load_add>; -def atomic_and_global : global_binary_atomic_op<atomic_load_and>; -def atomic_max_global : global_binary_atomic_op<atomic_load_max>; -def atomic_min_global : global_binary_atomic_op<atomic_load_min>; -def atomic_or_global : global_binary_atomic_op<atomic_load_or>; -def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; -def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; -def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; -def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; - -def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>; -def atomic_cmp_swap_global_nortn : PatFrag< - (ops node:$ptr, node:$value), - (atomic_cmp_swap_global node:$ptr, node:$value), - [{ return SDValue(N, 0).use_empty(); }] ->; - -def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>; -def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>; -def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>; -def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>; -def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>; -def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>; -def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>; -def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>; -def atomic_umin_flat : 
flat_binary_atomic_op<atomic_load_umin>; -def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>; - -def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; +defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; +defm atomic_add_global : global_binary_atomic_op<atomic_load_add>; +defm atomic_and_global : global_binary_atomic_op<atomic_load_and>; +defm atomic_max_global : global_binary_atomic_op<atomic_load_max>; +defm atomic_min_global : global_binary_atomic_op<atomic_load_min>; +defm atomic_or_global : global_binary_atomic_op<atomic_load_or>; +defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; +defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; +defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; +defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; + +//legacy +def AMDGPUatomic_cmp_swap_global : PatFrag< + (ops node:$ptr, node:$value), + (AMDGPUatomic_cmp_swap node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + +def atomic_cmp_swap_global : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + +def atomic_cmp_swap_global_noret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + +def atomic_cmp_swap_global_ret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; //===----------------------------------------------------------------------===// // Misc Pattern Fragments @@ -420,6 +421,7 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP16_ONE = 0x3C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -559,17 +561,26 @@ multiclass BFIPatterns <Instruction BFI_INT, def : Pat < (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) >; def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) >; + + def : Pat < + (f64 (fcopysign f64:$src0, f32:$src1)), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (BFI_INT (LoadImm32 (i32 0x7fffffff)), + (i32 (EXTRACT_SUBREG $src0, sub1)), + $src1), sub1) + >; } // SHA-256 Ma patterns @@ -620,9 +631,9 @@ def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; } // Properties = [SDNPCommutative, SDNPAssociative] +def sub_oneuse : HasOneUseBinOp<sub>; -// 24-bit arithmetic patterns -def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; +def select_oneuse : HasOneUseTernaryOp<select>; // Special conversion patterns diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index 2127391..ceae0b5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ 
b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -16,6 +16,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; + + // Deprecated in favor of llvm.amdgcn.sffbh def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; // Deprecated in favor of separate int_amdgcn_cube* intrinsics. @@ -29,9 +31,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_rsq : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; - - // Deprecated in favor of llvm.amdgcn.read.workdim - def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; } include "SIIntrinsics.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ad8d3e4..7d56355 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -36,13 +36,92 @@ using namespace llvm; -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) { } +#include "AMDGPUGenMCPseudoLowering.inc" + + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st, + const AsmPrinter &ap): + Ctx(ctx), ST(st), AP(ap) { } static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { switch (MOFlags) { - default: return MCSymbolRefExpr::VK_None; - case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL; + default: + return MCSymbolRefExpr::VK_None; + case SIInstrInfo::MO_GOTPCREL: + return MCSymbolRefExpr::VK_GOTPCREL; + case SIInstrInfo::MO_GOTPCREL32_LO: + return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO; + case SIInstrInfo::MO_GOTPCREL32_HI: + return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI; + case SIInstrInfo::MO_REL32_LO: + return MCSymbolRefExpr::VK_AMDGPU_REL32_LO; + case SIInstrInfo::MO_REL32_HI: + return MCSymbolRefExpr::VK_AMDGPU_REL32_HI; + } +} + +const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( + const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const { + const MCExpr *DestBBSym + = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); + const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); + + assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 && + ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); + + // s_getpc_b64 returns the address of next instruction. 
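The offset arithmetic behind getLongBranchBlockExpr above, written out with plain integers rather than MC expressions: s_getpc_b64 returns the address of the next instruction, so the distance is measured from source + 4, and the backward flavor negates the difference. The addresses below are arbitrary example values:

#include <cassert>
#include <cstdint>

static int64_t forward_offset(uint64_t src, uint64_t dest) {
  // src is the address of the 4-byte s_getpc_b64 at the start of the block.
  return static_cast<int64_t>(dest) - static_cast<int64_t>(src + 4);
}

static int64_t backward_offset(uint64_t src, uint64_t dest) {
  return static_cast<int64_t>(src + 4) - static_cast<int64_t>(dest);
}

int main() {
  // A getpc at 0x100 branching forward to 0x140.
  assert(forward_offset(0x100, 0x140) == 0x3c);
  // The same getpc branching back to 0x0c0 yields a positive magnitude.
  assert(backward_offset(0x100, 0x0c0) == 0x44);
  return 0;
}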
+ const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); + SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); + + if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD) + return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); + + assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD); + return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); +} + +bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + return true; + case MachineOperand::MO_Register: + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); + return true; + case MachineOperand::MO_MachineBasicBlock: { + if (MO.getTargetFlags() != 0) { + MCOp = MCOperand::createExpr( + getLongBranchBlockExpr(*MO.getParent()->getParent(), MO)); + } else { + MCOp = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); + } + + return true; + } + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + SmallString<128> SymbolName; + AP.getNameWithPrefix(SymbolName, GV); + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName); + const MCExpr *SymExpr = + MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); + const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, + MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + MCOp = MCOperand::createExpr(Expr); + return true; + } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + Sym->setExternal(true); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + return true; + } } } @@ -60,44 +139,24 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { for (const MachineOperand &MO : MI->explicit_operands()) { MCOperand MCOp; - switch (MO.getType()) { - default: - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - const MCExpr *SymExpr = - MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); - const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, - MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - case MachineOperand::MO_ExternalSymbol: { - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); - Sym->setExternal(true); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - } + lowerOperand(MO, MCOp); OutMI.addOperand(MCOp); } } +bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); + return MCInstLowering.lowerOperand(MO, MCOp); +} + void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + if (emitPseudoExpansionLowering(*OutStreamer, MI)) + return; + const 
AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); - AMDGPUMCInstLower MCInstLowering(OutContext, STI); + AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { @@ -137,6 +196,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::WAVE_BARRIER) { + if (isVerbose()) + OutStreamer->emitRawComment(" wave barrier"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h index 957dcd0..57d2d85 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -5,7 +5,6 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -/// \file //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H @@ -14,16 +13,28 @@ namespace llvm { class AMDGPUSubtarget; +class AsmPrinter; +class MachineBasicBlock; class MachineInstr; +class MachineOperand; class MCContext; +class MCExpr; class MCInst; +class MCOperand; class AMDGPUMCInstLower { MCContext &Ctx; const AMDGPUSubtarget &ST; + const AsmPrinter &AP; + + const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const; public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST, + const AsmPrinter &AP); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; /// \brief Lower a MachineInstr to an MCInst void lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 44516da..40c3327 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,23 +1,47 @@ +//===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + #include "AMDGPUMachineFunction.h" +#include "AMDGPUSubtarget.h" using namespace llvm; -// Pin the vtable to this file. -void AMDGPUMachineFunction::anchor() {} - AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), + LocalMemoryObjects(), KernArgSize(0), MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - ScratchSize(0), - IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL || - MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) -{ + IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL || + MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) { + // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, + // except reserved size is not correctly aligned. 
} -bool AMDGPUMachineFunction::isKernel() const -{ - return IsKernel; +unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, + const GlobalValue &GV) { + auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0)); + if (!Entry.second) + return Entry.first->second; + + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + unsigned Offset = LDSSize = alignTo(LDSSize, Align); + + Entry.first->second = Offset; + LDSSize += DL.getTypeAllocSize(GV.getValueType()); + + return Offset; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 6b31f63..5d0640b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -11,15 +11,26 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" -#include <map> +#include "llvm/ADT/DenseMap.h" namespace llvm { class AMDGPUMachineFunction : public MachineFunctionInfo { + /// A map to keep track of local memory objects and their offsets within the + /// local memory space. + SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects; + uint64_t KernArgSize; unsigned MaxKernArgAlign; - virtual void anchor(); + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; + + // FIXME: This should probably be removed. + /// Start of implicit kernel args + unsigned ABIArgOffset; + + bool IsKernel; public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -35,19 +46,31 @@ public: return Result; } - /// A map to keep track of local memory objects and their offsets within - /// the local memory space. - std::map<const GlobalValue *, unsigned> LocalMemoryObjects; - /// Number of bytes in the LDS that are being used. 
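A standalone sketch of the allocateLDSGlobal bookkeeping above, using plain STL containers in place of the LLVM types: the running LDS size is aligned up before each new assignment, the chosen offset is cached per global, and repeated queries return the cached value:

#include <cassert>
#include <string>
#include <unordered_map>

struct LDSAllocator {
  std::unordered_map<std::string, unsigned> Offsets;  // stands in for LocalMemoryObjects
  unsigned LDSSize = 0;

  static unsigned alignUp(unsigned Value, unsigned Align) {
    return (Value + Align - 1) / Align * Align;
  }

  unsigned allocate(const std::string &Name, unsigned Size, unsigned Align) {
    auto It = Offsets.find(Name);
    if (It != Offsets.end())
      return It->second;                 // already placed: reuse the offset

    unsigned Offset = LDSSize = alignUp(LDSSize, Align);
    Offsets[Name] = Offset;
    LDSSize += Size;                     // bump by the allocated size
    return Offset;
  }
};

int main() {
  LDSAllocator A;
  assert(A.allocate("a", 4, 4) == 0);    // occupies [0, 4)
  assert(A.allocate("b", 2, 16) == 16);  // aligned up from 4 to 16
  assert(A.allocate("a", 4, 4) == 0);    // cached offset returned
  assert(A.LDSSize == 18);
  return 0;
}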
- unsigned LDSSize; + uint64_t getKernArgSize() const { + return KernArgSize; + } - /// Start of implicit kernel args - unsigned ABIArgOffset; + unsigned getMaxKernArgAlign() const { + return MaxKernArgAlign; + } - bool isKernel() const; + void setABIArgOffset(unsigned NewOffset) { + ABIArgOffset = NewOffset; + } - unsigned ScratchSize; - bool IsKernel; + unsigned getABIArgOffset() const { + return ABIArgOffset; + } + + unsigned getLDSSize() const { + return LDSSize; + } + + bool isKernel() const { + return IsKernel; + } + + unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp index 8bc7b53..410bd52 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -358,7 +358,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { return transformKernels(M); } - const char *getPassName() const override { + StringRef getPassName() const override { return "AMDGPU OpenCL Image Type Pass"; } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h new file mode 100644 index 0000000..947d45b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -0,0 +1,42 @@ +//===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Enums and constants for AMDGPU PT_NOTE sections. +/// +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H + +namespace AMDGPU { + +namespace PT_NOTE { + +const char SectionName[] = ".note"; + +const char NoteName[] = "AMD"; + +enum NoteType{ + NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, + NT_AMDGPU_HSA_HSAIL = 2, + NT_AMDGPU_HSA_ISA = 3, + NT_AMDGPU_HSA_PRODUCER = 4, + NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, + NT_AMDGPU_HSA_EXTENSION = 6, + NT_AMDGPU_HSA_RUNTIME_METADATA = 7, + NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, + NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 +}; +} +} + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 0bad63f..baa28de 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -76,9 +76,7 @@ public: bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "AMDGPU Promote Alloca"; - } + StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } void handleAlloca(AllocaInst &I); @@ -184,13 +182,12 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // TODO: Have some sort of hint or other heuristics to guess occupancy based // on other factors.. - unsigned OccupancyHint - = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0); + unsigned OccupancyHint = ST.getWavesPerEU(F).second; if (OccupancyHint == 0) OccupancyHint = 7; // Clamp to max value. 
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU()); + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); // Check the hint but ignore it if it's obviously wrong from the existing LDS // usage. @@ -535,7 +532,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( std::vector<Value*> &WorkList) const { for (User *User : Val->users()) { - if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + if (is_contained(WorkList, User)) continue; if (CallInst *CI = dyn_cast<CallInst>(User)) { @@ -550,7 +547,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( if (UseInst->getOpcode() == Instruction::PtrToInt) return false; - if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) { + if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) { if (LI->isVolatile()) return false; @@ -564,11 +561,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( // Reject if the stored value is not the pointer operand. if (SI->getPointerOperand() != Val) return false; - } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) { + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) { if (RMW->isVolatile()) return false; - } else if (AtomicCmpXchgInst *CAS - = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) { + } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) { if (CAS->isVolatile()) return false; } @@ -583,6 +579,12 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( WorkList.push_back(ICmp); } + if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { + // Don't collect the users of this. + WorkList.push_back(User); + continue; + } + if (!User->getType()->isPointerTy()) continue; @@ -651,9 +653,11 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (AMDGPU::isShader(ContainingFunction.getCallingConv())) return; + const AMDGPUSubtarget &ST = + TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); // FIXME: We should also try to get this value from the reqd_work_group_size // function attribute if it is available. - unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction); + unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); @@ -741,7 +745,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; } - // The operand's value should be corrected on its own. + // The operand's value should be corrected on its own and we don't want to + // touch the users. if (isa<AddrSpaceCastInst>(V)) continue; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h index 40f6394..ecd2ac7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h @@ -13,18 +13,13 @@ /// /// Runtime requests certain information (metadata) about kernels to be able /// to execute the kernels and answer the queries about the kernels. -/// The metadata is represented as a byte stream in an ELF section of a -/// binary (code object). The byte stream consists of key-value pairs. -/// Each key is an 8 bit unsigned integer. Each value can be an integer, -/// a string, or a stream of key-value pairs. There are 3 levels of key-value -/// pair streams. At the beginning of the ELF section is the top level -/// key-value pair stream. A kernel-level key-value pair stream starts after -/// encountering KeyKernelBegin and ends immediately before encountering -/// KeyKernelEnd. 
A kernel-argument-level key-value pair stream starts -/// after encountering KeyArgBegin and ends immediately before encountering -/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top -/// level key-value pair stream. A kernel-argument-level key-value pair stream -/// can only appear in a kernel-level key-value pair stream. +/// The metadata is represented as a note element in the .note ELF section of a +/// binary (code object). The desc field of the note element is a YAML string +/// consisting of key-value pairs. Each key is a string. Each value can be +/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps. +/// At the beginning of the YAML string is the module level YAML map. A +/// kernel-level YAML map is in the amd.Kernels sequence. A +/// kernel-argument-level map is in the amd.Args sequence. /// /// The format should be kept backward compatible. New enum values and bit /// fields should be appended at the end. It is suggested to bump up the @@ -37,77 +32,64 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H -#include <stdint.h> +#include <cstdint> +#include <vector> +#include <string> namespace AMDGPU { namespace RuntimeMD { // Version and revision of runtime metadata - const unsigned char MDVersion = 1; + const unsigned char MDVersion = 2; const unsigned char MDRevision = 0; - // ELF section name containing runtime metadata - const char SectionName[] = ".AMDGPU.runtime_metadata"; - - // Enumeration values of keys in runtime metadata. - enum Key { - KeyNull = 0, // Place holder. Ignored when encountered - KeyMDVersion = 1, // Runtime metadata version - KeyLanguage = 2, // Language - KeyLanguageVersion = 3, // Language version - KeyKernelBegin = 4, // Beginning of kernel-level stream - KeyKernelEnd = 5, // End of kernel-level stream - KeyKernelName = 6, // Kernel name - KeyArgBegin = 7, // Beginning of kernel-arg-level stream - KeyArgEnd = 8, // End of kernel-arg-level stream - KeyArgSize = 9, // Kernel arg size - KeyArgAlign = 10, // Kernel arg alignment - KeyArgTypeName = 11, // Kernel type name - KeyArgName = 12, // Kernel name - KeyArgTypeKind = 13, // Kernel argument type kind - KeyArgValueType = 14, // Kernel argument value type - KeyArgAddrQual = 15, // Kernel argument address qualifier - KeyArgAccQual = 16, // Kernel argument access qualifier - KeyArgIsConst = 17, // Kernel argument is const qualified - KeyArgIsRestrict = 18, // Kernel argument is restrict qualified - KeyArgIsVolatile = 19, // Kernel argument is volatile qualified - KeyArgIsPipe = 20, // Kernel argument is pipe qualified - KeyReqdWorkGroupSize = 21, // Required work group size - KeyWorkGroupSizeHint = 22, // Work group size hint - KeyVecTypeHint = 23, // Vector type hint - KeyKernelIndex = 24, // Kernel index for device enqueue - KeySGPRs = 25, // Number of SGPRs - KeyVGPRs = 26, // Number of VGPRs - KeyMinWavesPerSIMD = 27, // Minimum number of waves per SIMD - KeyMaxWavesPerSIMD = 28, // Maximum number of waves per SIMD - KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits - KeyMaxWorkGroupSize = 30, // Maximum work group size - KeyNoPartialWorkGroups = 31, // No partial work groups - }; - - enum Language : uint8_t { - OpenCL_C = 0, - HCC = 1, - OpenMP = 2, - OpenCL_CPP = 3, -}; - - enum LanguageVersion : uint16_t { - V100 = 100, - V110 = 110, - V120 = 120, - V200 = 200, - V210 = 210, - }; + // Name of keys for runtime metadata. 
+ namespace KeyName { + const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version + const char Language[] = "amd.Language"; // Language + const char LanguageVersion[] = "amd.LanguageVersion"; // Language version + const char Kernels[] = "amd.Kernels"; // Kernels + const char KernelName[] = "amd.KernelName"; // Kernel name + const char Args[] = "amd.Args"; // Kernel arguments + const char ArgSize[] = "amd.ArgSize"; // Kernel arg size + const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment + const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name + const char ArgName[] = "amd.ArgName"; // Kernel name + const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind + const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type + const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier + const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier + const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified + const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified + const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified + const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified + const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size + const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint + const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint + const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue + const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups + const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information + const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier + const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type + } namespace KernelArg { - enum TypeKind : uint8_t { - Value = 0, - Pointer = 1, - Image = 2, - Sampler = 3, - Queue = 4, + enum Kind : uint8_t { + ByValue = 0, + GlobalBuffer = 1, + DynamicSharedPointer = 2, + Sampler = 3, + Image = 4, + Pipe = 5, + Queue = 6, + HiddenGlobalOffsetX = 7, + HiddenGlobalOffsetY = 8, + HiddenGlobalOffsetZ = 9, + HiddenNone = 10, + HiddenPrintfBuffer = 11, + HiddenDefaultQueue = 12, + HiddenCompletionAction = 13, }; enum ValueType : uint16_t { @@ -125,13 +107,86 @@ namespace RuntimeMD { F64 = 11, }; + // Avoid using 'None' since it conflicts with a macro in X11 header file. enum AccessQualifer : uint8_t { - None = 0, + AccNone = 0, ReadOnly = 1, WriteOnly = 2, ReadWrite = 3, }; + + enum AddressSpaceQualifer : uint8_t { + Private = 0, + Global = 1, + Constant = 2, + Local = 3, + Generic = 4, + Region = 5, + }; } // namespace KernelArg + + // Invalid values are used to indicate an optional key should not be emitted. + const uint8_t INVALID_ADDR_QUAL = 0xff; + const uint8_t INVALID_ACC_QUAL = 0xff; + const uint32_t INVALID_KERNEL_INDEX = ~0U; + + namespace KernelArg { + // In-memory representation of kernel argument information. 
+ struct Metadata { + uint32_t Size; + uint32_t Align; + uint32_t PointeeAlign; + uint8_t Kind; + uint16_t ValueType; + std::string TypeName; + std::string Name; + uint8_t AddrQual; + uint8_t AccQual; + uint8_t IsVolatile; + uint8_t IsConst; + uint8_t IsRestrict; + uint8_t IsPipe; + Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0), + AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0), + IsConst(0), IsRestrict(0), IsPipe(0) {} + }; + } + + namespace Kernel { + // In-memory representation of kernel information. + struct Metadata { + std::string Name; + std::string Language; + std::vector<uint8_t> LanguageVersion; + std::vector<uint32_t> ReqdWorkGroupSize; + std::vector<uint32_t> WorkGroupSizeHint; + std::string VecTypeHint; + uint32_t KernelIndex; + uint8_t NoPartialWorkGroups; + std::vector<KernelArg::Metadata> Args; + Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {} + }; + } + + namespace Program { + // In-memory representation of program information. + struct Metadata { + std::vector<uint8_t> MDVersionSeq; + std::vector<std::string> PrintfInfo; + std::vector<Kernel::Metadata> Kernels; + + explicit Metadata(){} + + // Construct from an YAML string. + explicit Metadata(const std::string &YAML); + + // Convert to YAML string. + std::string toYAML(); + + // Convert from YAML string. + static Metadata fromYAML(const std::string &S); + }; + } } // namespace RuntimeMD } // namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 10fa9cf..c35a67d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,14 +13,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "SIFrameLowering.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Target/TargetFrameLowering.h" +#include <algorithm> using namespace llvm; @@ -31,7 +27,7 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::~AMDGPUSubtarget() {} +AMDGPUSubtarget::~AMDGPUSubtarget() = default; AMDGPUSubtarget & AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, @@ -52,10 +48,18 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, ParseSubtargetFeatures(GPU, FullFS); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es + // on VI and newer hardware to avoid assertion failures due to missing ADDR64 + // variants of MUBUF instructions. + if (!hasAddr64() && !FS.contains("flat-for-global")) { + FlatForGlobal = true; + } + // FIXME: I don't think think Evergreen has any useful support for // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? 
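The KernelArg::Metadata, Kernel::Metadata and Program::Metadata structures above are the in-memory side of the YAML note described at the top of this header. As a hedged sketch (it assumes AMDGPURuntimeMetadata.h is on the include path and that the toYAML() implementation added elsewhere in this change is linked in; the kernel name and argument values are invented), a producer could fill them and serialize like this:

    // Sketch only: populate the runtime metadata structures and emit the
    // module-level YAML map (amd.MDVersion, amd.Kernels, amd.Args, ...).
    #include "AMDGPURuntimeMetadata.h"
    #include <iostream>

    using namespace AMDGPU::RuntimeMD;

    int main() {
      Program::Metadata Prog;
      Prog.MDVersionSeq = {MDVersion, MDRevision}; // amd.MDVersion

      Kernel::Metadata K;
      K.Name = "vec_add";                 // amd.KernelName (made up)
      K.ReqdWorkGroupSize = {64, 1, 1};   // amd.ReqdWorkGroupSize

      KernelArg::Metadata Arg;
      Arg.Size = 8;                       // amd.ArgSize
      Arg.Align = 8;                      // amd.ArgAlign
      Arg.Kind = KernelArg::GlobalBuffer; // amd.ArgKind
      Arg.TypeName = "float*";            // amd.ArgTypeName
      Arg.Name = "dst";                   // amd.ArgName
      K.Args.push_back(Arg);

      Prog.Kernels.push_back(K);
      std::cout << Prog.toYAML() << '\n';
      return 0;
    }

Optional keys keep their invalid defaults (INVALID_ADDR_QUAL, INVALID_ACC_QUAL, INVALID_KERNEL_INDEX), so per the comment above they are simply not emitted.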
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP16Denormals = false; FP32Denormals = false; FP64Denormals = false; } @@ -81,10 +85,12 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FastFMAF32(false), HalfRate64Ops(false), + FP16Denormals(false), FP32Denormals(false), FP64Denormals(false), FPExceptions(false), FlatForGlobal(false), + UnalignedScratchAccess(false), UnalignedBufferAccess(false), EnableXNACK(false), @@ -107,6 +113,10 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, SGPRInitBug(false), HasSMemRealTime(false), Has16BitInsts(false), + HasMovrel(false), + HasVGPRIndexMode(false), + HasScalarStores(false), + HasInv2PiInlineImm(false), FlatAddressSpace(false), R600ALUInst(false), @@ -114,6 +124,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, CFALUBug(false), HasVertexCache(false), TexVTXClauseSize(0), + ScalarizeGlobal(false), FeatureDisable(false), InstrItins(getInstrItineraryForCPU(GPU)) { @@ -178,6 +189,86 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { return 1; } +std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( + const Function &F) const { + // Default minimum/maximum flat work group sizes. + std::pair<unsigned, unsigned> Default = + AMDGPU::isCompute(F.getCallingConv()) ? + std::pair<unsigned, unsigned>(getWavefrontSize() * 2, + getWavefrontSize() * 4) : + std::pair<unsigned, unsigned>(1, getWavefrontSize()); + + // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa + // starts using "amdgpu-flat-work-group-size" attribute. + Default.second = AMDGPU::getIntegerAttribute( + F, "amdgpu-max-work-group-size", Default.second); + Default.first = std::min(Default.first, Default.second); + + // Requested minimum/maximum flat work group sizes. + std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( + F, "amdgpu-flat-work-group-size", Default); + + // Make sure requested minimum is less than requested maximum. + if (Requested.first > Requested.second) + return Default; + + // Make sure requested values do not violate subtarget's specifications. + if (Requested.first < getMinFlatWorkGroupSize()) + return Default; + if (Requested.second > getMaxFlatWorkGroupSize()) + return Default; + + return Requested; +} + +std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( + const Function &F) const { + // Default minimum/maximum number of waves per execution unit. + std::pair<unsigned, unsigned> Default(1, 0); + + // Default/requested minimum/maximum flat work group sizes. + std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); + + // If minimum/maximum flat work group sizes were explicitly requested using + // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum + // number of waves per execution unit to values implied by requested + // minimum/maximum flat work group sizes. + unsigned MinImpliedByFlatWorkGroupSize = + getMaxWavesPerEU(FlatWorkGroupSizes.second); + bool RequestedFlatWorkGroupSize = false; + + // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa + // starts using "amdgpu-flat-work-group-size" attribute. + if (F.hasFnAttribute("amdgpu-max-work-group-size") || + F.hasFnAttribute("amdgpu-flat-work-group-size")) { + Default.first = MinImpliedByFlatWorkGroupSize; + RequestedFlatWorkGroupSize = true; + } + + // Requested minimum/maximum number of waves per execution unit. 
+ std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( + F, "amdgpu-waves-per-eu", Default, true); + + // Make sure requested minimum is less than requested maximum. + if (Requested.second && Requested.first > Requested.second) + return Default; + + // Make sure requested values do not violate subtarget's specifications. + if (Requested.first < getMinWavesPerEU() || + Requested.first > getMaxWavesPerEU()) + return Default; + if (Requested.second > getMaxWavesPerEU()) + return Default; + + // Make sure requested values are compatible with values implied by requested + // minimum/maximum flat work group sizes. + if (RequestedFlatWorkGroupSize && + Requested.first > MinImpliedByFlatWorkGroupSize) + return Default; + + return Requested; +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUSubtarget(TT, GPU, FS, TM), @@ -190,21 +281,7 @@ SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this), - GISel() {} - -unsigned R600Subtarget::getStackEntrySize() const { - switch (getWavefrontSize()) { - case 16: - return 8; - case 32: - return hasCaymanISA() ? 4 : 8; - case 64: - return 4; - default: - llvm_unreachable("Illegal wavefront size."); - } -} + TLInfo(TM, *this) {} void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { @@ -227,15 +304,67 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -unsigned SISubtarget::getAmdKernelCodeChipID() const { - switch (getGeneration()) { - case SEA_ISLANDS: - return 12; - default: - llvm_unreachable("ChipID unknown"); +unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, + unsigned ExplicitArgBytes) const { + unsigned ImplicitBytes = getImplicitArgNumBytes(MF); + if (ImplicitBytes == 0) + return ExplicitArgBytes; + + unsigned Alignment = getAlignmentForImplicitArgPtr(); + return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; +} + +unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (SGPRs <= 80) + return 10; + if (SGPRs <= 88) + return 9; + if (SGPRs <= 100) + return 8; + return 7; } + if (SGPRs <= 48) + return 10; + if (SGPRs <= 56) + return 9; + if (SGPRs <= 64) + return 8; + if (SGPRs <= 72) + return 7; + if (SGPRs <= 80) + return 6; + return 5; } -AMDGPU::IsaVersion SISubtarget::getIsaVersion() const { - return AMDGPU::getIsaVersion(getFeatureBits()); +unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { + if (VGPRs <= 24) + return 10; + if (VGPRs <= 28) + return 9; + if (VGPRs <= 32) + return 8; + if (VGPRs <= 36) + return 7; + if (VGPRs <= 40) + return 6; + if (VGPRs <= 48) + return 5; + if (VGPRs <= 64) + return 4; + if (VGPRs <= 84) + return 3; + if (VGPRs <= 128) + return 2; + return 1; +} + +unsigned SISubtarget::getMaxNumSGPRs() const { + if (hasSGPRInitBug()) + return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + + if (getGeneration() >= VOLCANIC_ISLANDS) + return 102; + + return 104; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 3fe61aa..0e3cb7d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -23,15 +23,22 @@ 
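The new occupancy helpers make the register and occupancy trade-off easy to check by hand. Below is a hypothetical standalone mirror of getOccupancyWithNumSGPRs/getOccupancyWithNumVGPRs (thresholds copied from the VOLCANIC_ISLANDS branch above) together with the getKernArgSegmentSize arithmetic, assuming 32 implicit bytes at 8-byte alignment (the amdhsa OpenCL configuration described later in this change); it is not a use of the real SISubtarget API.

    // Worked example: how register usage caps waves per SIMD, and how the
    // kernarg segment size is rounded. Mirrors the logic added above.
    #include <algorithm>
    #include <cstdio>

    static unsigned occupancyWithSGPRs(unsigned SGPRs) { // VI+ thresholds
      if (SGPRs <= 80)  return 10;
      if (SGPRs <= 88)  return 9;
      if (SGPRs <= 100) return 8;
      return 7;
    }

    static unsigned occupancyWithVGPRs(unsigned VGPRs) {
      if (VGPRs <= 24)  return 10;
      if (VGPRs <= 28)  return 9;
      if (VGPRs <= 32)  return 8;
      if (VGPRs <= 36)  return 7;
      if (VGPRs <= 40)  return 6;
      if (VGPRs <= 48)  return 5;
      if (VGPRs <= 64)  return 4;
      if (VGPRs <= 84)  return 3;
      if (VGPRs <= 128) return 2;
      return 1;
    }

    static unsigned alignTo(unsigned V, unsigned A) { return (V + A - 1) / A * A; }

    int main() {
      // A kernel using 90 SGPRs and 40 VGPRs: the SGPR budget allows 8 waves,
      // the VGPR budget only 6, so achievable occupancy is 6 waves per SIMD.
      unsigned Waves = std::min(occupancyWithSGPRs(90), occupancyWithVGPRs(40));

      // 20 explicit kernarg bytes round up to 24 at 8-byte alignment, plus
      // 32 implicit bytes gives a 56-byte kernarg segment.
      unsigned KernArgBytes = alignTo(20, 8) + 32;

      std::printf("%u waves/SIMD, %u kernarg bytes\n", Waves, KernArgBytes);
      return 0;
    }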
#include "SIISelLowering.h" #include "SIFrameLowering.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> +#include <cstdint> +#include <memory> +#include <utility> #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" namespace llvm { -class SIMachineFunctionInfo; class StringRef; class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { @@ -50,9 +57,13 @@ public: ISAVersion0_0_0, ISAVersion7_0_0, ISAVersion7_0_1, + ISAVersion7_0_2, ISAVersion8_0_0, ISAVersion8_0_1, - ISAVersion8_0_3 + ISAVersion8_0_2, + ISAVersion8_0_3, + ISAVersion8_0_4, + ISAVersion8_1_0, }; protected: @@ -70,10 +81,12 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. + bool FP16Denormals; bool FP32Denormals; bool FP64Denormals; bool FPExceptions; bool FlatForGlobal; + bool UnalignedScratchAccess; bool UnalignedBufferAccess; bool EnableXNACK; bool DebuggerInsertNops; @@ -97,40 +110,60 @@ protected: bool SGPRInitBug; bool HasSMemRealTime; bool Has16BitInsts; + bool HasMovrel; + bool HasVGPRIndexMode; + bool HasScalarStores; + bool HasInv2PiInlineImm; bool FlatAddressSpace; bool R600ALUInst; bool CaymanISA; bool CFALUBug; bool HasVertexCache; short TexVTXClauseSize; + bool ScalarizeGlobal; // Dummy feature to use for assembler in tablegen. bool FeatureDisable; InstrItineraryData InstrItins; + SelectionDAGTargetInfo TSInfo; public: AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM); - virtual ~AMDGPUSubtarget(); + ~AMDGPUSubtarget() override; + AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const override; - const AMDGPUFrameLowering *getFrameLowering() const override; - const AMDGPUTargetLowering *getTargetLowering() const override; - const AMDGPURegisterInfo *getRegisterInfo() const override; + const AMDGPUInstrInfo *getInstrInfo() const override = 0; + const AMDGPUFrameLowering *getFrameLowering() const override = 0; + const AMDGPUTargetLowering *getTargetLowering() const override = 0; + const AMDGPURegisterInfo *getRegisterInfo() const override = 0; const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } + // Nothing implemented, just prevent crashes on use. + const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool isAmdHsaOS() const { return TargetTriple.getOS() == Triple::AMDHSA; } + bool isMesa3DOS() const { + return TargetTriple.getOS() == Triple::Mesa3D; + } + + bool isOpenCLEnv() const { + return TargetTriple.getEnvironment() == Triple::OpenCL; + } + Generation getGeneration() const { return Gen; } @@ -151,6 +184,10 @@ public: return MaxPrivateElementSize; } + bool has16BitInsts() const { + return Has16BitInsts; + } + bool hasHWFP64() const { return FP64; } @@ -230,6 +267,10 @@ public: return DumpCode; } + bool enableIEEEBit(const MachineFunction &MF) const { + return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + } + /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. 
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; @@ -238,6 +279,9 @@ public: /// the given LDS memory size is the only constraint. unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + bool hasFP16Denormals() const { + return FP16Denormals; + } bool hasFP32Denormals() const { return FP32Denormals; @@ -259,22 +303,43 @@ public: return UnalignedBufferAccess; } + bool hasUnalignedScratchAccess() const { + return UnalignedScratchAccess; + } + bool isXNACKEnabled() const { return EnableXNACK; } - unsigned getMaxWavesPerCU() const { - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 10; + bool isMesaKernel(const MachineFunction &MF) const { + return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); + } - // FIXME: Not sure what this is for other subtagets. - return 8; + // Covers VS/PS/CS graphics shaders + bool isMesaGfxShader(const MachineFunction &MF) const { + return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv()); + } + + bool isAmdCodeObjectV2(const MachineFunction &MF) const { + return isAmdHsaOS() || isMesaKernel(MF); } /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset() const { - return isAmdHsaOS() ? 0 : 36; + unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { + return isAmdCodeObjectV2(MF) ? 0 : 36; + } + + unsigned getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? 8 : 4; + } + + unsigned getImplicitArgNumBytes(const MachineFunction &MF) const { + if (isMesaKernel(MF)) + return 16; + if (isAmdHsaOS() && isOpenCLEnv()) + return 32; + return 0; } unsigned getStackAlignment() const { @@ -289,6 +354,92 @@ public: bool enableSubRegLiveness() const override { return true; } + + /// \returns Number of execution units per compute unit supported by the + /// subtarget. + unsigned getEUsPerCU() const { + return 4; + } + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given flat work group size. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { + if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 8; + return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16; + } + + /// \returns Maximum number of waves per compute unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerCU() const { + return getMaxWavesPerEU() * getEUsPerCU(); + } + + /// \returns Maximum number of waves per compute unit supported by the + /// subtarget and limited by given flat work group size. + unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { + return getWavesPerWorkGroup(FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. + unsigned getMinWavesPerEU() const { + return 1; + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { + if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 8; + // FIXME: Need to take scratch memory into account. + return 10; + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given flat work group size. 
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { + return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) / + getEUsPerCU(); + } + + /// \returns Minimum flat work group size supported by the subtarget. + unsigned getMinFlatWorkGroupSize() const { + return 1; + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const { + return 2048; + } + + /// \returns Number of waves per work group given the flat work group size. + unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { + return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize(); + } + + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + + /// \returns Subtarget's default pair of minimum/maximum flat work group sizes + /// for function \p F, or minimum/maximum flat work group sizes explicitly + /// requested using "amdgpu-flat-work-group-size" attribute attached to + /// function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, or violate subtarget's specifications. + std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; + + /// \returns Subtarget's default pair of minimum/maximum number of waves per + /// execution unit for function \p F, or minimum/maximum number of waves per + /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute + /// attached to function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, violate subtarget's specifications, or are not + /// compatible with minimum/maximum number of waves limited by flat work group + /// size, register usage, and/or lds usage. + std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; }; class R600Subtarget final : public AMDGPUSubtarget { @@ -328,14 +479,14 @@ public: short getTexVTXClauseSize() const { return TexVTXClauseSize; } - - unsigned getStackEntrySize() const; }; class SISubtarget final : public AMDGPUSubtarget { public: enum { - FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + // The closed Vulkan driver sets 96, which limits the wave count to 8 but + // doesn't spill SGPRs as much as when 80 is set. 
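The flat-work-group helpers compose as follows: getWavesPerWorkGroup() rounds the group size up to whole wavefronts, and getMaxWavesPerEU(unsigned) spreads those waves across the CU's execution units, which is the value getWavesPerEU() earlier in this change uses as its default minimum once a flat work group size has been explicitly requested. A short worked example, assuming a wavefront size of 64 and 4 EUs per CU; the mirrored helpers below are hypothetical, not the subtarget methods themselves.

    // Worked example of the wave bookkeeping formulas added above.
    #include <cstdio>

    static unsigned alignTo(unsigned V, unsigned A) { return (V + A - 1) / A * A; }

    int main() {
      const unsigned WavefrontSize = 64, EUsPerCU = 4;
      const unsigned FlatWorkGroupSize = 192;

      // getWavesPerWorkGroup(192) = alignTo(192, 64) / 64 = 3 waves.
      unsigned WavesPerWG = alignTo(FlatWorkGroupSize, WavefrontSize) / WavefrontSize;

      // getMaxWavesPerEU(192) = alignTo(3, 4) / 4 = 1 wave per EU.
      unsigned WavesPerEU = alignTo(WavesPerWG, EUsPerCU) / EUsPerCU;

      std::printf("%u waves per work group, %u wave(s) per EU\n",
                  WavesPerWG, WavesPerEU);
      return 0;
    }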
+ FIXED_SGPR_COUNT_FOR_INIT_BUG = 96 }; private: @@ -378,10 +529,6 @@ public: bool isVGPRSpillingEnabled(const Function& F) const; - unsigned getAmdKernelCodeChipID() const; - - AMDGPU::IsaVersion getIsaVersion() const; - unsigned getMaxNumUserSGPRs() const { return 16; } @@ -394,8 +541,24 @@ public: return HasSMemRealTime; } - bool has16BitInsts() const { - return Has16BitInsts; + bool hasMovrel() const { + return HasMovrel; + } + + bool hasVGPRIndexMode() const { + return HasVGPRIndexMode; + } + + bool hasScalarCompareEq64() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + + bool hasScalarStores() const { + return HasScalarStores; + } + + bool hasInv2PiInlineImm() const { + return HasInv2PiInlineImm; } bool enableSIScheduler() const { @@ -426,37 +589,28 @@ public: bool hasSGPRInitBug() const { return SGPRInitBug; } -}; - - -inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const { - if (getGeneration() >= SOUTHERN_ISLANDS) - return static_cast<const SISubtarget *>(this)->getInstrInfo(); - - return static_cast<const R600Subtarget *>(this)->getInstrInfo(); -} -inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const { - if (getGeneration() >= SOUTHERN_ISLANDS) - return static_cast<const SISubtarget *>(this)->getFrameLowering(); + bool has12DWordStoreHazard() const { + return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; + } - return static_cast<const R600Subtarget *>(this)->getFrameLowering(); -} + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; -inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const { - if (getGeneration() >= SOUTHERN_ISLANDS) - return static_cast<const SISubtarget *>(this)->getTargetLowering(); + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs + unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; - return static_cast<const R600Subtarget *>(this)->getTargetLowering(); -} + /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs + unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; -inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const { - if (getGeneration() >= SOUTHERN_ISLANDS) - return static_cast<const SISubtarget *>(this)->getRegisterInfo(); + /// \returns True if waitcnt instruction is needed before barrier instruction, + /// false otherwise. 
+ bool needWaitcntBeforeBarrier() const { + return true; + } - return static_cast<const R600Subtarget *>(this)->getRegisterInfo(); -} + unsigned getMaxNumSGPRs() const; +}; -} // End namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b2d4e11..d8a0c71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -18,28 +18,32 @@ #include "AMDGPUCallLowering.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" +#include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" - -#include "llvm/Analysis/Passes.h" +#include "SIMachineScheduler.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include <memory> using namespace llvm; @@ -64,13 +68,20 @@ static cl::opt<bool> EnableR600IfConvert( static cl::opt<bool> EnableLoadStoreVectorizer( "amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), + cl::init(true), + cl::Hidden); + +// Option to to control global loads scalarization +static cl::opt<bool> ScalarizeGlobal( + "amdgpu-scalarize-global-loads", + cl::desc("Enable global load scalarization"), cl::init(false), cl::Hidden); extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target - RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); - RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); + RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); PassRegistry *PR = PassRegistry::getPassRegistry(); initializeSILowerI1CopiesPass(*PR); @@ -83,20 +94,36 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); - initializeSIDebuggerInsertNopsPass(*PR); initializeSIInsertWaitsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); + initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); + initializeSIOptimizeExecMaskingPass(*PR); } static 
std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - return make_unique<AMDGPUTargetObjectFile>(); + return llvm::make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); + return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>()); +} + +static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +static ScheduleDAGInstrs * +createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + ScheduleDAGMILive *DAG = + new ScheduleDAGMILive(C, + llvm::make_unique<GCNMaxOccupancySchedStrategy>(C)); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; } static MachineSchedRegistry @@ -107,6 +134,11 @@ static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler); +static MachineSchedRegistry +GCNMaxOccupancySchedRegistry("gcn-max-occupancy", + "Run GCN scheduler to maximize occupancy", + createGCNMaxOccupancyMachineScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. @@ -147,13 +179,11 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), - TLOF(createTLOF(getTargetTriple())), - IntrinsicInfo() { - setRequiresStructuredCFG(true); + TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { } +AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); @@ -169,6 +199,10 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { FSAttr.getValueAsString(); } +void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { + PM.add(createAMDGPUUnifyMetadataPass()); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// @@ -178,7 +212,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, TargetOptions Options, Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { + setRequiresStructuredCFG(true); +} const R600Subtarget *R600TargetMachine::getSubtargetImpl( const Function &F) const { @@ -206,13 +242,15 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( #ifdef LLVM_BUILD_GLOBAL_ISEL namespace { + struct SIGISelActualAccessor : public GISelAccessor { std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; const AMDGPUCallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } }; -} // End anonymous namespace. 
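addEarlyAsPossiblePasses() above is the hook through which the new AMDGPUUnifyMetadata pass reaches the IR pipeline before anything else runs. A hedged sketch of how a frontend already holding an AMDGPUTargetMachine might drive it; the runEarlyPasses wrapper is hypothetical and the snippet assumes it is linked against the AMDGPU target libraries.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"

    // Ask the target for its earliest IR passes (for AMDGPU this now adds
    // AMDGPUUnifyMetadata) and run them over a freshly linked module.
    void runEarlyPasses(llvm::TargetMachine &TM, llvm::Module &M) {
      llvm::legacy::PassManager PM;
      TM.addEarlyAsPossiblePasses(PM);
      PM.run(M);
    }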
+ +} // end anonymous namespace #endif GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, @@ -248,6 +286,8 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { I->setGISelAccessor(*GISel); } + I->setScalarizeGlobalBehavior(ScalarizeGlobal); + return I.get(); } @@ -261,7 +301,6 @@ class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { - // Exceptions and StackMaps are not supported, so these passes will never do // anything. disablePass(&StackMapLivenessID); @@ -272,6 +311,14 @@ public: return getTM<AMDGPUTargetMachine>(); } + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + ScheduleDAGMILive *DAG = createGenericSchedLive(C); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; + } + void addEarlyCSEOrGVNPass(); void addStraightLineScalarOptimizationPasses(); void addIRPasses() override; @@ -284,7 +331,7 @@ public: class R600PassConfig final : public AMDGPUPassConfig { public: R600PassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } + : AMDGPUPassConfig(TM, PM) {} ScheduleDAGInstrs *createMachineScheduler( MachineSchedContext *C) const override { @@ -300,7 +347,7 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } + : AMDGPUPassConfig(TM, PM) {} GCNTargetMachine &getGCNTargetMachine() const { return getTM<GCNTargetMachine>(); @@ -315,16 +362,19 @@ public: bool addInstSelector() override; #ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; + bool addLegalizeMachineIR() override; bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; #endif void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; + void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; -} // End of anonymous namespace +} // end anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { @@ -363,7 +413,7 @@ void AMDGPUPassConfig::addIRPasses() { // Function calls are not supported, so make sure we inline everything. 
addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerPass()); + addPass(createAlwaysInlinerLegacyPass()); // We need to add the barrier noop pass, otherwise adding the function // inlining pass will cause all of the PassConfigs passes to be run // one function at a time, which means if we have a nodule with two @@ -380,9 +430,9 @@ void AMDGPUPassConfig::addIRPasses() { if (EnableSROA) addPass(createSROAPass()); - } - addStraightLineScalarOptimizationPasses(); + addStraightLineScalarOptimizationPasses(); + } TargetPassConfig::addIRPasses(); @@ -415,7 +465,7 @@ bool AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel())); return false; } @@ -468,7 +518,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); if (ST.enableSIScheduler()) return createSIMachineScheduler(C); - return nullptr; + return createGCNMaxOccupancyMachineScheduler(C); } bool GCNPassConfig::addPreISel() { @@ -498,6 +548,7 @@ void GCNPassConfig::addMachineSSAOptimization() { // XXX - Can we get away without running DeadMachineInstructionElim again? addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); + addPass(&SILoadStoreOptimizerID); } void GCNPassConfig::addIRPasses() { @@ -520,43 +571,54 @@ bool GCNPassConfig::addIRTranslator() { return false; } +bool GCNPassConfig::addLegalizeMachineIR() { + return false; +} + bool GCNPassConfig::addRegBankSelect() { return false; } + +bool GCNPassConfig::addGlobalInstructionSelect() { + return false; +} #endif void GCNPassConfig::addPreRegAlloc() { - // This needs to be run directly before register allocation because - // earlier passes might recompute live intervals. - // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass - if (getOptLevel() > CodeGenOpt::None) { - insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); - } - - if (getOptLevel() > CodeGenOpt::None) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. - - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. - - // FIXME: Move pre-RA and remove extra reg coalescer run. - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - insertPass(&MachineSchedulerID, &RegisterCoalescerID); - } - addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + // FIXME: We have to disable the verifier here because of PHIElimination + + // TwoAddressInstructions disabling it. + + // This must be run immediately after phi elimination and before + // TwoAddressInstructions, otherwise the processing of the tied operand of + // SI_ELSE will introduce a copy of the tied operand source after the else. + insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + TargetPassConfig::addFastRegAlloc(RegAllocPass); } void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // This needs to be run directly before register allocation because earlier + // passes might recompute live intervals. 
+ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + + // This must be run immediately after phi elimination and before + // TwoAddressInstructions, otherwise the processing of the tied operand of + // SI_ELSE will introduce a copy of the tied operand source after the else. + insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } +void GCNPassConfig::addPostRegAlloc() { + addPass(&SIOptimizeExecMaskingID); + TargetPassConfig::addPostRegAlloc(); +} + void GCNPassConfig::addPreSched2() { } @@ -573,8 +635,9 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); - addPass(createSILowerControlFlowPass()); + addPass(&SIInsertSkipsPassID); addPass(createSIDebuggerInsertNopsPass()); + addPass(&BranchRelaxationPassID); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index b0eb3a9..9496773 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -17,6 +17,13 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" +#include <memory> namespace llvm { @@ -37,10 +44,10 @@ public: StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); - ~AMDGPUTargetMachine(); + ~AMDGPUTargetMachine() override; const AMDGPUSubtarget *getSubtargetImpl() const; - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override; + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0; const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; @@ -50,6 +57,7 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + void addEarlyAsPossiblePasses(PassManagerBase &PM) override; }; //===----------------------------------------------------------------------===// @@ -90,13 +98,6 @@ public: const SISubtarget *getSubtargetImpl(const Function &) const override; }; -inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl( - const Function &F) const { - if (getTargetTriple().getArch() == Triple::amdgcn) - return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F); - return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F); -} +} // end namespace llvm -} // End namespace llvm - -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index 03d1e2c..1fddc88 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -9,10 +9,10 @@ #include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/ELF.h" +#include "Utils/AMDGPUBaseInfo.h" using namespace llvm; @@ -20,12 +20,11 @@ using namespace llvm; // Generic Object File //===----------------------------------------------------------------------===// -MCSection 
*AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, - SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const { - if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) && + AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple())) return TextSection; - return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index f530e09..de32778 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -23,8 +23,7 @@ namespace llvm { class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { public: - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, + MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3d630fe..e904870 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,7 +80,7 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { return Vector ? 0 : 32; } -unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) { +unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { switch (AddrSpace) { case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: @@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { @@ -241,6 +241,7 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_mbcnt_hi: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a82a074..0d83b2a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -64,13 +64,6 @@ public: ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} - // Provide value semantics. MSVC requires that we spell all of these out. 
- AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) - : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} - AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) - : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), - TLI(std::move(Arg.TLI)) {} - bool hasBranchDivergence() { return true; } void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -82,7 +75,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace); + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( @@ -90,7 +83,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getCFInstrCost(unsigned Opcode); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp new file mode 100644 index 0000000..bf501a1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -0,0 +1,149 @@ +//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// \brief This pass that unifies multiple OpenCL metadata due to linking. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + namespace kOCLMD { + const char SpirVer[] = "opencl.spir.version"; + const char OCLVer[] = "opencl.ocl.version"; + const char UsedExt[] = "opencl.used.extensions"; + const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; + const char CompilerOptions[] = "opencl.compiler.options"; + const char LLVMIdent[] = "llvm.ident"; + } + + /// \brief Unify multiple OpenCL metadata due to linking. + class AMDGPUUnifyMetadata : public FunctionPass { + public: + static char ID; + explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {}; + + private: + // This should really be a module pass but we have to run it as early + // as possible, so given function passes are executed first and + // TargetMachine::addEarlyAsPossiblePasses() expects only function passes + // it has to be a function pass. + virtual bool runOnModule(Module &M); + + // \todo: Convert to a module pass. + virtual bool runOnFunction(Function &F); + + /// \brief Unify version metadata. + /// \return true if changes are made. + /// Assume the named metadata has operands each of which is a pair of + /// integer constant, e.g. + /// !Name = {!n1, !n2} + /// !n1 = {i32 1, i32 2} + /// !n2 = {i32 2, i32 0} + /// Keep the largest version as the sole operand if PickFirst is false. + /// Otherwise pick it from the first value, representing kernel module. 
+ bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) { + auto NamedMD = M.getNamedMetadata(Name); + if (!NamedMD || NamedMD->getNumOperands() <= 1) + return false; + MDNode *MaxMD = nullptr; + auto MaxVer = 0U; + for (const auto &VersionMD : NamedMD->operands()) { + assert(VersionMD->getNumOperands() == 2); + auto CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0)); + auto VersionMajor = CMajor->getZExtValue(); + auto CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1)); + auto VersionMinor = CMinor->getZExtValue(); + auto Ver = (VersionMajor * 100) + (VersionMinor * 10); + if (Ver > MaxVer) { + MaxVer = Ver; + MaxMD = VersionMD; + } + if (PickFirst) + break; + } + NamedMD->eraseFromParent(); + NamedMD = M.getOrInsertNamedMetadata(Name); + NamedMD->addOperand(MaxMD); + return true; + } + + /// \brief Unify version metadata. + /// \return true if changes are made. + /// Assume the named metadata has operands each of which is a list e.g. + /// !Name = {!n1, !n2} + /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}} + /// !n2 = !{!"cl_khr_image"} + /// Combine it into a single list with unique operands. + bool unifyExtensionMD(Module &M, StringRef Name) { + auto NamedMD = M.getNamedMetadata(Name); + if (!NamedMD || NamedMD->getNumOperands() == 1) + return false; + + SmallVector<Metadata *, 4> All; + for (const auto &MD : NamedMD->operands()) + for (const auto &Op : MD->operands()) + if (std::find(All.begin(), All.end(), Op.get()) == All.end()) + All.push_back(Op.get()); + + NamedMD->eraseFromParent(); + NamedMD = M.getOrInsertNamedMetadata(Name); + for (const auto &MD : All) + NamedMD->addOperand(MDNode::get(M.getContext(), MD)); + + return true; + } +}; + +} // end anonymous namespace + +char AMDGPUUnifyMetadata::ID = 0; + +char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID; + +INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata", + "Unify multiple OpenCL metadata due to linking", + false, false) + +FunctionPass* llvm::createAMDGPUUnifyMetadataPass() { + return new AMDGPUUnifyMetadata(); +} + +bool AMDGPUUnifyMetadata::runOnModule(Module &M) { + const char* Vers[] = { + kOCLMD::SpirVer, + kOCLMD::OCLVer + }; + const char* Exts[] = { + kOCLMD::UsedExt, + kOCLMD::UsedOptCoreFeat, + kOCLMD::CompilerOptions, + kOCLMD::LLVMIdent + }; + + bool Changed = false; + + for (auto &I : Vers) + Changed |= unifyVersionMD(M, I, true); + + for (auto &I : Exts) + Changed |= unifyExtensionMD(M, I); + + return Changed; +} + +bool AMDGPUUnifyMetadata::runOnFunction(Function &F) { + return runOnModule(*F.getParent()); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 21de763..7faeccd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -139,16 +138,15 @@ public: initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const override { + StringRef getPassName() const override { return "AMDGPU Control Flow Graph structurizer Pass"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<MachineFunctionAnalysis>(); - 
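To see what AMDGPUUnifyMetadata actually does to a linked module, the following self-contained sketch builds duplicate opencl.ocl.version entries and runs the pass through the legacy pass manager. The module contents are invented for the example, and it assumes the AMDGPU target headers and libraries are available; note the pass is registered as a FunctionPass, so the module needs at least one defined function for runOnFunction() to fire.

    #include "AMDGPU.h" // createAMDGPUUnifyMetadataPass()
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Append one {major, minor} pair to a named metadata node.
    static void addVersion(Module &M, StringRef Name, unsigned Major,
                           unsigned Minor) {
      LLVMContext &Ctx = M.getContext();
      Metadata *Ops[] = {
          ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), Major)),
          ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), Minor))};
      M.getOrInsertNamedMetadata(Name)->addOperand(MDNode::get(Ctx, Ops));
    }

    int main() {
      LLVMContext Ctx;
      Module M("linked", Ctx);

      // Two linked modules each contributed an OpenCL version entry.
      addVersion(M, "opencl.ocl.version", 1, 2);
      addVersion(M, "opencl.ocl.version", 2, 0);

      // A trivial defined function so the function pass actually runs.
      Function *F = Function::Create(
          FunctionType::get(Type::getVoidTy(Ctx), false),
          GlobalValue::ExternalLinkage, "stub", &M);
      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
      B.CreateRetVoid();

      legacy::PassManager PM;
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.run(M);

      // opencl.ocl.version now has a single operand, the first {1, 2} pair:
      // runOnModule() calls unifyVersionMD() with PickFirst set, treating the
      // first module as the kernel module.
      M.print(outs(), nullptr);
      return 0;
    }

Extension-style lists such as opencl.used.extensions are handled by unifyExtensionMD() in the same run, which concatenates all operands and drops duplicates.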
AU.addRequired<MachineFunctionAnalysis>(); AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); } /// Perform the CFG structurization @@ -220,7 +218,8 @@ protected: bool needMigrateBlock(MachineBasicBlock *MBB) const; // Utility Functions - void reversePredicateSetter(MachineBasicBlock::iterator I); + void reversePredicateSetter(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB); /// Compute the reversed DFS post order of Blocks void orderBlocks(MachineFunction *MF); @@ -422,26 +421,24 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { } void AMDGPUCFGStructurizer::reversePredicateSetter( - MachineBasicBlock::iterator I) { - assert(static_cast<MachineInstr *>(I) && "Expected valid iterator"); + MachineBasicBlock::iterator I, MachineBasicBlock &MBB) { + assert(I.isValid() && "Expected valid iterator"); for (;; --I) { + if (I == MBB.end()) + continue; if (I->getOpcode() == AMDGPU::PRED_X) { - switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { - case OPCODE_IS_ZERO_INT: - static_cast<MachineInstr *>(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO_INT); + switch (I->getOperand(2).getImm()) { + case AMDGPU::PRED_SETE_INT: + I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT); return; - case OPCODE_IS_NOT_ZERO_INT: - static_cast<MachineInstr *>(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO_INT); + case AMDGPU::PRED_SETNE_INT: + I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT); return; - case OPCODE_IS_ZERO: - static_cast<MachineInstr *>(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO); + case AMDGPU::PRED_SETE: + I->getOperand(2).setImm(AMDGPU::PRED_SETNE); return; - case OPCODE_IS_NOT_ZERO: - static_cast<MachineInstr *>(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO); + case AMDGPU::PRED_SETNE: + I->getOperand(2).setImm(AMDGPU::PRED_SETE); return; default: llvm_unreachable("PRED_X Opcode invalid!"); @@ -841,7 +838,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + *GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -864,7 +861,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -908,9 +905,9 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { //walk through all the block in func to check for unreachable typedef GraphTraits<MachineFunction *> GTM; - MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); + auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); for (; It != E; ++It) { - MachineBasicBlock *MBB = &(*It); + MachineBasicBlock *MBB = *It; SccNum = getSCCNum(MBB); if (SccNum == INVALIDSCCNUM) dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; @@ -995,7 +992,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { // Triangle pattern, true is empty // We reverse the predicate to make a triangle, empty false pattern; std::swap(TrueMBB, FalseMBB); - reversePredicateSetter(MBB->end()); + reversePredicateSetter(MBB->end(), *MBB); LandBlk = FalseMBB; FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 @@ -1505,7 +1502,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) - reversePredicateSetter(I); + reversePredicateSetter(I, *I->getParent()); insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); insertInstrBefore(I, AMDGPU::BREAK); insertInstrBefore(I, AMDGPU::ENDIF); diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index efcf1b2..3cf9a1d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -15,38 +15,62 @@ #include "Utils/AMDKernelCodeTUtils.h" #include "Utils/AMDGPUAsmUtils.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <map> +#include <memory> +#include <string> +#include <vector> using namespace llvm; +using namespace llvm::AMDGPU; namespace { -struct OptionalOperand; +class AMDGPUAsmParser; enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL }; 
+//===----------------------------------------------------------------------===// +// Operand +//===----------------------------------------------------------------------===// + class AMDGPUOperand : public MCParsedAsmOperand { enum KindTy { Token, @@ -56,16 +80,18 @@ class AMDGPUOperand : public MCParsedAsmOperand { } Kind; SMLoc StartLoc, EndLoc; + const AMDGPUAsmParser *AsmParser; public: - AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_) + : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} typedef std::unique_ptr<AMDGPUOperand> Ptr; struct Modifiers { - bool Abs; - bool Neg; - bool Sext; + bool Abs = false; + bool Neg = false; + bool Sext = false; bool hasFPModifiers() const { return Abs || Neg; } bool hasIntModifiers() const { return Sext; } @@ -126,8 +152,15 @@ public: ImmTyDA, ImmTyR128, ImmTyLWE, + ImmTyExpTgt, + ImmTyExpCompr, + ImmTyExpVM, ImmTyHwreg, + ImmTyOff, ImmTySendMsg, + ImmTyInterpSlot, + ImmTyInterpAttr, + ImmTyAttrChan }; struct TokOp { @@ -136,18 +169,16 @@ public: }; struct ImmOp { - bool IsFPImm; - ImmTy Type; int64_t Val; + ImmTy Type; + bool IsFPImm; Modifiers Mods; }; struct RegOp { unsigned RegNo; - Modifiers Mods; - const MCRegisterInfo *TRI; - const MCSubtargetInfo *STI; bool IsForcedVOP3; + Modifiers Mods; }; union { @@ -175,41 +206,66 @@ public: return Kind == Immediate; } - bool isInlinableImm() const { - if (!isImmTy(ImmTyNone)) { - // Only plain immediates are inlinable (e.g. "clamp" attribute is not) - return false; - } - // TODO: We should avoid using host float here. It would be better to - // check the float bit values which is what a few other places do. - // We've had bot failures before due to weird NaN support on mips hosts. 
- const float F = BitsToFloat(Imm.Val); - // TODO: Add 1/(2*pi) for VI - return (Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0); - } + bool isInlinableImm(MVT type) const; + bool isLiteralImm(MVT type) const; bool isRegKind() const { return Kind == Register; } bool isReg() const override { - return isRegKind() && !Reg.Mods.hasModifiers(); + return isRegKind() && !hasModifiers(); + } + + bool isRegOrImmWithInputMods(MVT type) const { + return isRegKind() || isInlinableImm(type); + } + + bool isRegOrImmWithInt16InputMods() const { + return isRegOrImmWithInputMods(MVT::i16); + } + + bool isRegOrImmWithInt32InputMods() const { + return isRegOrImmWithInputMods(MVT::i32); + } + + bool isRegOrImmWithInt64InputMods() const { + return isRegOrImmWithInputMods(MVT::i64); + } + + bool isRegOrImmWithFP16InputMods() const { + return isRegOrImmWithInputMods(MVT::f16); } - bool isRegOrImmWithInputMods() const { - return isRegKind() || isInlinableImm(); + bool isRegOrImmWithFP32InputMods() const { + return isRegOrImmWithInputMods(MVT::f32); + } + + bool isRegOrImmWithFP64InputMods() const { + return isRegOrImmWithInputMods(MVT::f64); + } + + bool isVReg() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::VReg_64RegClassID) || + isRegClass(AMDGPU::VReg_96RegClassID) || + isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_256RegClassID) || + isRegClass(AMDGPU::VReg_512RegClassID); + } + + bool isVReg32OrOff() const { + return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; } - + bool isImmModifier() const { return isImm() && Imm.Type != ImmTyNone; } - + bool isClampSI() const { return isImmTy(ImmTyClampSI); } bool isOModSI() const { return isImmTy(ImmTyOModSI); } bool isDMask() const { return isImmTy(ImmTyDMask); } @@ -217,6 +273,10 @@ public: bool isDA() const { return isImmTy(ImmTyDA); } bool isR128() const { return isImmTy(ImmTyUNorm); } bool isLWE() const { return isImmTy(ImmTyLWE); } + bool isOff() const { return isImmTy(ImmTyOff); } + bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } + bool isExpVM() const { return isImmTy(ImmTyExpVM); } + bool isExpCompr() const { return isImmTy(ImmTyExpCompr); } bool isOffen() const { return isImmTy(ImmTyOffen); } bool isIdxen() const { return isImmTy(ImmTyIdxen); } bool isAddr64() const { return isImmTy(ImmTyAddr64); } @@ -234,7 +294,10 @@ public: bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); } - + bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); } + bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); } + bool isAttrChan() const { return isImmTy(ImmTyAttrChan); } + bool isMod() const { return isClampSI() || isOModSI(); } @@ -243,47 +306,116 @@ public: return isReg() || isImm(); } - bool isRegClass(unsigned RCID) const { - return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg()); + bool isRegClass(unsigned RCID) const; + + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { + return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); } - bool isSCSrc32() const { - return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID); + bool isSCSrcB16() const { + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } - bool isSCSrc64() const { 
- return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID); + bool isSCSrcB32() const { + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } - bool isSSrc32() const { - return isImm() || isSCSrc32() || isExpr(); + bool isSCSrcB64() const { + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } - bool isSSrc64() const { + bool isSCSrcF16() const { + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); + } + + bool isSCSrcF32() const { + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); + } + + bool isSCSrcF64() const { + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64); + } + + bool isSSrcB32() const { + return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr(); + } + + bool isSSrcB16() const { + return isSCSrcB16() || isLiteralImm(MVT::i16); + } + + bool isSSrcB64() const { // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. // See isVSrc64(). - return isImm() || isSCSrc64(); + return isSCSrcB64() || isLiteralImm(MVT::i64); + } + + bool isSSrcF32() const { + return isSCSrcB32() || isLiteralImm(MVT::f32) || isExpr(); + } + + bool isSSrcF64() const { + return isSCSrcB64() || isLiteralImm(MVT::f64); + } + + bool isSSrcF16() const { + return isSCSrcB16() || isLiteralImm(MVT::f16); + } + + bool isVCSrcB32() const { + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); + } + + bool isVCSrcB64() const { + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); + } + + bool isVCSrcB16() const { + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); + } + + bool isVCSrcF32() const { + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); + } + + bool isVCSrcF64() const { + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); + } + + bool isVCSrcF16() const { + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); + } + + bool isVSrcB32() const { + return isVCSrcF32() || isLiteralImm(MVT::i32); + } + + bool isVSrcB64() const { + return isVCSrcF64() || isLiteralImm(MVT::i64); } - bool isVCSrc32() const { - return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID); + bool isVSrcB16() const { + return isVCSrcF16() || isLiteralImm(MVT::i16); } - bool isVCSrc64() const { - return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID); + bool isVSrcF32() const { + return isVCSrcF32() || isLiteralImm(MVT::f32); } - bool isVSrc32() const { - return isImm() || isVCSrc32(); + bool isVSrcF64() const { + return isVCSrcF64() || isLiteralImm(MVT::f64); } - bool isVSrc64() const { - // TODO: Check if the 64-bit value (coming from assembly source) can be - // narrowed to 32 bits (in the instruction stream). That require knowledge - // of instruction type (unsigned/signed, floating or "untyped"/B64), - // see [AMD GCN3 ISA 6.3.1]. - // TODO: How 64-bit values are formed from 32-bit literals in _B64 insns? 
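To summarize the naming scheme behind the predicates defined above: the leading letter gives the register file (S = scalar sources, V = VALU sources, i.e. VGPR or SGPR via the VS_* classes), an extra C marks sources that take inline constants but no literal, B vs. F selects the integer or floating-point type used for the inline-constant check, and the trailing number is the operand width in bits. A few representative rows, read off the definitions above (not exhaustive):

//              register class         inline-constant type   literal allowed?
// isSCSrcB32   SReg_32                MVT::i32               no
// isSSrcB32    SReg_32                MVT::i32               yes (or an expression)
// isVCSrcF16   VS_32 (VGPR or SGPR)   MVT::f16               no
// isVSrcF64    VS_64 (VGPR or SGPR)   MVT::f64               yes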
- return isImm() || isVCSrc64(); + bool isVSrcF16() const { + return isVCSrcF16() || isLiteralImm(MVT::f16); + } + + bool isKImmFP32() const { + return isLiteralImm(MVT::f32); + } + + bool isKImmFP16() const { + return isLiteralImm(MVT::f16); } bool isMem() const override { @@ -301,9 +433,11 @@ public: bool isSWaitCnt() const; bool isHwreg() const; bool isSendMsg() const; - bool isSMRDOffset() const; + bool isSMRDOffset8() const; + bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; bool isDPPCtrl() const; + bool isGPRIdxMode() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -311,7 +445,6 @@ public: return S->getSymbol().getName(); } - StringRef getToken() const { assert(isToken()); @@ -359,7 +492,7 @@ public: bool hasModifiers() const { return getModifiers().hasModifiers(); } - + bool hasFPModifiers() const { return getModifiers().hasFPModifiers(); } @@ -368,30 +501,23 @@ public: return getModifiers().hasIntModifiers(); } - void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const { - if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) { - // Apply modifiers to immediate value - int64_t Val = Imm.Val; - bool Negate = Imm.Mods.Neg; // Only negate can get here - if (Imm.IsFPImm) { - APFloat F(BitsToFloat(Val)); - if (Negate) { - F.changeSign(); - } - Val = F.bitcastToAPInt().getZExtValue(); - } else { - Val = Negate ? -Val : Val; - } - Inst.addOperand(MCOperand::createImm(Val)); - } else { - Inst.addOperand(MCOperand::createImm(getImm())); - } + void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const; + + void addLiteralImmOperand(MCInst &Inst, int64_t Val) const; + + template <unsigned Bitwidth> + void addKImmFPOperands(MCInst &Inst, unsigned N) const; + + void addKImmFP16Operands(MCInst &Inst, unsigned N) const { + addKImmFPOperands<16>(Inst, N); } - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); + void addKImmFP32Operands(MCInst &Inst, unsigned N) const { + addKImmFPOperands<32>(Inst, N); } + void addRegOperands(MCInst &Inst, unsigned N) const; + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { if (isRegKind()) addRegOperands(Inst, N); @@ -421,6 +547,23 @@ public: addRegOrImmWithInputModsOperands(Inst, N); } + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + assert(isRegKind()); + addRegOperands(Inst, N); + } + + void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegWithInputModsOperands(Inst, N); + } + + void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegWithInputModsOperands(Inst, N); + } + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { if (isImm()) addImmOperands(Inst, N); @@ -430,7 +573,7 @@ public: } } - void printImmTy(raw_ostream& OS, ImmTy Type) const { + static void printImmTy(raw_ostream& OS, ImmTy Type) { switch (Type) { case ImmTyNone: OS << "None"; break; case ImmTyGDS: OS << "GDS"; break; @@ -458,8 +601,15 @@ public: case ImmTyDA: OS << "DA"; break; case ImmTyR128: OS << "R128"; break; case ImmTyLWE: OS << "LWE"; break; + case ImmTyOff: OS << "Off"; break; + case ImmTyExpTgt: OS << "ExpTgt"; break; + case ImmTyExpCompr: OS << "ExpCompr"; break; + case ImmTyExpVM: OS << "ExpVM"; break; case ImmTyHwreg: OS << "Hwreg"; break; case 
ImmTySendMsg: OS << "SendMsg"; break; + case ImmTyInterpSlot: OS << "InterpSlot"; break; + case ImmTyInterpAttr: OS << "InterpAttr"; break; + case ImmTyAttrChan: OS << "AttrChan"; break; } } @@ -484,22 +634,24 @@ public: } } - static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc, + static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser, + int64_t Val, SMLoc Loc, enum ImmTy Type = ImmTyNone, bool IsFPImm = false) { - auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); + auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; - Op->Imm.Mods = {false, false, false}; + Op->Imm.Mods = Modifiers(); Op->StartLoc = Loc; Op->EndLoc = Loc; return Op; } - static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc, + static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser, + StringRef Str, SMLoc Loc, bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique<AMDGPUOperand>(Token); + auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); Res->StartLoc = Loc; @@ -507,24 +659,22 @@ public: return Res; } - static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S, + static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, + unsigned RegNo, SMLoc S, SMLoc E, - const MCRegisterInfo *TRI, - const MCSubtargetInfo *STI, bool ForceVOP3) { - auto Op = llvm::make_unique<AMDGPUOperand>(Register); + auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser); Op->Reg.RegNo = RegNo; - Op->Reg.TRI = TRI; - Op->Reg.STI = STI; - Op->Reg.Mods = {false, false, false}; + Op->Reg.Mods = Modifiers(); Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique<AMDGPUOperand>(Expression); + static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser, + const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; @@ -537,6 +687,53 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { return OS; } +//===----------------------------------------------------------------------===// +// AsmParser +//===----------------------------------------------------------------------===// + +// Holds info related to the current kernel, e.g. count of SGPRs used. +// Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next +// .amdgpu_hsa_kernel or at EOF. 
+class KernelScopeInfo { + int SgprIndexUnusedMin; + int VgprIndexUnusedMin; + MCContext *Ctx; + + void usesSgprAt(int i) { + if (i >= SgprIndexUnusedMin) { + SgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count")); + Sym->setVariableValue(MCConstantExpr::create(SgprIndexUnusedMin, *Ctx)); + } + } + } + void usesVgprAt(int i) { + if (i >= VgprIndexUnusedMin) { + VgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); + } + } + } +public: + KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr) + {} + void initialize(MCContext &Context) { + Ctx = &Context; + usesSgprAt(SgprIndexUnusedMin = -1); + usesVgprAt(VgprIndexUnusedMin = -1); + } + void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { + switch (RegKind) { + case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; + case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; + default: break; + } + } +}; + class AMDGPUAsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; MCAsmParser &Parser; @@ -544,22 +741,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned ForcedEncodingSize; bool ForcedDPP; bool ForcedSDWA; - - bool isSI() const { - return AMDGPU::isSI(getSTI()); - } - - bool isCI() const { - return AMDGPU::isCI(getSTI()); - } - - bool isVI() const { - return AMDGPU::isVI(getSTI()); - } - - bool hasSGPR102_SGPR103() const { - return !isVI(); - } + KernelScopeInfo KernelScope; /// @name Auto-generated Match Functions /// { @@ -570,9 +752,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } private: + bool ParseAsAbsoluteExpression(uint32_t &Ret); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); + bool ParseDirectiveRuntimeMetadata(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); bool ParseSectionDirectiveHSAText(); @@ -584,7 +768,7 @@ private: bool ParseSectionDirectiveHSADataGlobalProgram(); bool ParseSectionDirectiveHSARodataReadonlyAgent(); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); - bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth); + bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn); public: @@ -622,6 +806,27 @@ public: Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); } + KernelScope.initialize(getContext()); + } + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasInv2PiInlineImm() const { + return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -629,6 +834,16 @@ public: return static_cast<AMDGPUTargetStreamer &>(TS); } + const MCRegisterInfo *getMRI() const { + // We need this const_cast because for some reason getContext() is not const + 
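A minimal usage sketch of KernelScopeInfo, assuming an MCContext Ctx is available and with hypothetical register indices; the tracked maximum index plus one becomes the value of the .kernel.sgpr_count / .kernel.vgpr_count symbols:

  KernelScopeInfo KS;
  KS.initialize(Ctx);              // resets both counters and defines the symbols
  KS.usesRegister(IS_VGPR, 7, 1);  // v7 parsed       -> .kernel.vgpr_count = 8
  KS.usesRegister(IS_SGPR, 10, 2); // s[10:11] parsed -> .kernel.sgpr_count = 12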
// in MCAsmParser. + return const_cast<AMDGPUAsmParser*>(this)->getContext().getRegisterInfo(); + } + + const MCInstrInfo *getMII() const { + return &MII; + } + void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; } void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; } void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; } @@ -637,6 +852,7 @@ public: bool isForcedVOP3() const { return ForcedEncodingSize == 64; } bool isForcedDPP() const { return ForcedDPP; } bool isForcedSDWA() const { return ForcedSDWA; } + ArrayRef<unsigned> getMatchedVariants() const; std::unique_ptr<AMDGPUOperand> parseRegister(); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; @@ -652,23 +868,31 @@ public: StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; + //bool ProcessInstruction(MCInst &Inst); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, - OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t&) = 0); - OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); + OperandMatchResultTy + parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t &) = nullptr); + OperandMatchResultTy + parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, + StringRef &Value); OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseReg(OperandVector &Operands); OperandMatchResultTy parseRegOrImm(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands); + OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands); + void cvtExp(MCInst &Inst, const OperandVector &Operands); bool parseCnt(int64_t &IntVal); OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); @@ -683,10 +907,17 @@ private: bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); + + void errorExpTgt(); + OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); + OperandMatchResultTy parseExpTgt(OperandVector &Operands); OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); + OperandMatchResultTy parseInterpSlot(OperandVector &Operands); + OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy 
parseSOppBrTarget(OperandVector &Operands); void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } @@ -701,7 +932,8 @@ public: AMDGPUOperand::Ptr defaultDA() const; AMDGPUOperand::Ptr defaultR128() const; AMDGPUOperand::Ptr defaultLWE() const; - AMDGPUOperand::Ptr defaultSMRDOffset() const; + AMDGPUOperand::Ptr defaultSMRDOffset8() const; + AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -736,8 +968,274 @@ struct OptionalOperand { bool (*ConvertResult)(int64_t&); }; +} // end anonymous namespace + +// May be called with integer type with equivalent bitwidth. +static const fltSemantics *getFltSemantics(unsigned Size) { + switch (Size) { + case 4: + return &APFloat::IEEEsingle(); + case 8: + return &APFloat::IEEEdouble(); + case 2: + return &APFloat::IEEEhalf(); + default: + llvm_unreachable("unsupported fp type"); + } +} + +static const fltSemantics *getFltSemantics(MVT VT) { + return getFltSemantics(VT.getSizeInBits() / 8); +} + +//===----------------------------------------------------------------------===// +// Operand +//===----------------------------------------------------------------------===// + +static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) { + bool Lost; + + // Convert literal to single precision + APFloat::opStatus Status = FPLiteral.convert(*getFltSemantics(VT), + APFloat::rmNearestTiesToEven, + &Lost); + // We allow precision lost but not overflow or underflow + if (Status != APFloat::opOK && + Lost && + ((Status & APFloat::opOverflow) != 0 || + (Status & APFloat::opUnderflow) != 0)) { + return false; + } + + return true; +} + +bool AMDGPUOperand::isInlinableImm(MVT type) const { + if (!isImmTy(ImmTyNone)) { + // Only plain immediates are inlinable (e.g. "clamp" attribute is not) + return false; + } + // TODO: We should avoid using host float here. It would be better to + // check the float bit values which is what a few other places do. + // We've had bot failures before due to weird NaN support on mips hosts. + + APInt Literal(64, Imm.Val); + + if (Imm.IsFPImm) { // We got fp literal token + if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand + return AMDGPU::isInlinableLiteral64(Imm.Val, + AsmParser->hasInv2PiInlineImm()); + } + + APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); + if (!canLosslesslyConvertToFPType(FPLiteral, type)) + return false; + + // Check if single precision literal is inlinable + return AMDGPU::isInlinableLiteral32( + static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()), + AsmParser->hasInv2PiInlineImm()); + } + + + // We got int literal token. + if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand + return AMDGPU::isInlinableLiteral64(Imm.Val, + AsmParser->hasInv2PiInlineImm()); + } + + if (type.getScalarSizeInBits() == 16) { + return AMDGPU::isInlinableLiteral16( + static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()), + AsmParser->hasInv2PiInlineImm()); + } + + return AMDGPU::isInlinableLiteral32( + static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()), + AsmParser->hasInv2PiInlineImm()); +} + +bool AMDGPUOperand::isLiteralImm(MVT type) const { + // Check that this imediate can be added as literal + if (!isImmTy(ImmTyNone)) { + return false; + } + + if (!Imm.IsFPImm) { + // We got int literal token. 
+ + unsigned Size = type.getSizeInBits(); + if (Size == 64) + Size = 32; + + // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP + // types. + return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val); + } + + // We got fp literal token + if (type == MVT::f64) { // Expected 64-bit fp operand + // We would set low 64-bits of literal to zeroes but we accept this literals + return true; + } + + if (type == MVT::i64) { // Expected 64-bit int operand + // We don't allow fp literals in 64-bit integer instructions. It is + // unclear how we should encode them. + return false; + } + + APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); + return canLosslesslyConvertToFPType(FPLiteral, type); +} + +bool AMDGPUOperand::isRegClass(unsigned RCID) const { + return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { + int64_t Val = Imm.Val; + if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) { + // Apply modifiers to immediate value. Only negate can get here + if (Imm.IsFPImm) { + APFloat F(BitsToDouble(Val)); + F.changeSign(); + Val = F.bitcastToAPInt().getZExtValue(); + } else { + Val = -Val; + } + } + + if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), + Inst.getNumOperands())) { + addLiteralImmOperand(Inst, Val); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } +} + +void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { + const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode()); + auto OpNum = Inst.getNumOperands(); + // Check that this operand accepts literals + assert(AMDGPU::isSISrcOperand(InstDesc, OpNum)); + + auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size + + if (Imm.IsFPImm) { // We got fp literal token + APInt Literal(64, Val); + + switch (OpSize) { + case 8: { + if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); + return; + } + + // Non-inlineable + if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand + // For fp operands we check if low 32 bits are zeros + if (Literal.getLoBits(32) != 0) { + const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(), + "Can't encode literal as exact 64-bit floating-point operand. " + "Low 32-bits will be set to zero"); + } + + Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue())); + return; + } + + // We don't allow fp literals in 64-bit integer instructions. It is + // unclear how we should encode them. This case should be checked earlier + // in predicate methods (isLiteralImm()) + llvm_unreachable("fp literal in 64-bit integer instruction."); + } + case 4: + case 2: { + bool lost; + APFloat FPLiteral(APFloat::IEEEdouble(), Literal); + // Convert literal to single precision + FPLiteral.convert(*getFltSemantics(OpSize), + APFloat::rmNearestTiesToEven, &lost); + // We allow precision lost but not overflow or underflow. This should be + // checked earlier in isLiteralImm() + Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); + return; + } + default: + llvm_unreachable("invalid operand size"); + } + + return; + } + + // We got int literal token. + // Only sign extend inline immediates. 
+ // FIXME: No errors on truncation + switch (OpSize) { + case 4: { + if (isInt<32>(Val) && + AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); + return; + } + case 8: { + if (AMDGPU::isInlinableLiteral64(Val, + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Lo_32(Val))); + return; + } + case 2: { + if (isInt<16>(Val) && + AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Val & 0xffff)); + return; + } + default: + llvm_unreachable("invalid operand size"); + } +} + +template <unsigned Bitwidth> +void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const { + APInt Literal(64, Imm.Val); + + if (!Imm.IsFPImm) { + // We got int literal token. + Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue())); + return; + } + + bool Lost; + APFloat FPLiteral(APFloat::IEEEdouble(), Literal); + FPLiteral.convert(*getFltSemantics(Bitwidth / 8), + APFloat::rmNearestTiesToEven, &Lost); + Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); +} + +void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI()))); +} + +//===----------------------------------------------------------------------===// +// AsmParser +//===----------------------------------------------------------------------===// + static int getRegClass(RegisterKind Is, unsigned RegWidth) { if (Is == IS_VGPR) { switch (RegWidth) { @@ -818,12 +1316,13 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R RegWidth++; return true; default: - assert(false); return false; + llvm_unreachable("unexpected register kind"); } } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth) +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex) { + if (DwordRegIndex) { *DwordRegIndex = 0; } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (getLexer().is(AsmToken::Identifier)) { StringRef RegName = Parser.getTok().getString(); @@ -883,7 +1382,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, } else if (getLexer().is(AsmToken::LBrac)) { // List of consecutive registers: [s0,s1,s2,s3] Parser.Lex(); - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr)) return false; if (RegWidth != 1) return false; @@ -895,7 +1394,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, } else if (getLexer().is(AsmToken::RBrac)) { Parser.Lex(); break; - } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) { + } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) { if (RegWidth1 != 1) { return false; } @@ -923,11 +1422,12 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, { unsigned Size = 1; if (RegKind == IS_SGPR || RegKind == IS_TTMP) { - // SGPR and TTMP registers must be are aligned. Max required alignment is 4 dwords. 
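The code above defers the actual check to AMDGPU::isInlinableLiteral16/32/64. For reference, a standalone sketch of the 32-bit rule that the removed isInlinableImm() used to hard-code; the helper name is made up, and the 1/(2*pi) entry is only accepted when FeatureInv2PiInlineImm is set (see hasInv2PiInlineImm() above):

static bool isInlineConstant32(int32_t Imm, bool HasInv2Pi) {
  if (Imm >= -16 && Imm <= 64)           // small signed integers (covers 0.0 too)
    return true;
  switch (static_cast<uint32_t>(Imm)) {  // selected single-precision bit patterns
  case 0x3f000000: case 0xbf000000:      // +/-0.5
  case 0x3f800000: case 0xbf800000:      // +/-1.0
  case 0x40000000: case 0xc0000000:      // +/-2.0
  case 0x40800000: case 0xc0800000:      // +/-4.0
    return true;
  case 0x3e22f983:                       // 1/(2*pi)
    return HasInv2Pi;
  default:
    return false;
  }
}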
+ // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords. Size = std::min(RegWidth, 4u); } if (RegNum % Size != 0) return false; + if (DwordRegIndex) { *DwordRegIndex = RegNum; } RegNum = RegNum / Size; int RCID = getRegClass(RegKind, RegWidth); if (RCID == -1) @@ -940,7 +1440,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, } default: - assert(false); return false; + llvm_unreachable("unexpected register kind"); } if (!subtargetHasRegister(*TRI, Reg)) @@ -952,20 +1452,19 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { const auto &Tok = Parser.getTok(); SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - RegisterKind RegKind; - unsigned Reg, RegNum, RegWidth; + unsigned Reg, RegNum, RegWidth, DwordRegIndex; - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { return nullptr; } - return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc, - TRI, &getSTI(), false); + KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseImm(OperandVector &Operands) { + // TODO: add syntactic sugar for 1/(2*PI) bool Minus = false; if (getLexer().getKind() == AsmToken::Minus) { Minus = true; @@ -978,28 +1477,21 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { - Error(S, "invalid immediate: only 32-bit values are legal"); - return MatchOperand_ParseFail; - } - if (Minus) IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); return MatchOperand_Success; } case AsmToken::Real: { - // FIXME: We should emit an error if a double precisions floating-point - // value is used. I'm not sure the best way to detect this. 
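The effect of the check above, with hypothetical operands: Size = std::min(RegWidth, 4) and RegNum % Size == 0 mean an SGPR or TTMP tuple must start at an index aligned to its width, capped at four dwords, while VGPR tuples keep Size = 1 and are not constrained here.

// s[4:5]   width 2, first index 4 -> accepted  (4 % 2 == 0)
// s[5:6]   width 2, first index 5 -> rejected  (5 % 2 != 0)
// s[6:9]   width 4, first index 6 -> rejected  (6 % 4 != 0)
// s[8:15]  width 8, first index 8 -> accepted  (alignment capped at 4 dwords)
// v[3:4]   VGPR pair              -> not subject to this check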
int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - APFloat F((float)BitsToDouble(IntVal)); + APFloat F(BitsToDouble(IntVal)); if (Minus) F.changeSign(); Operands.push_back( - AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S, + AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S, AMDGPUOperand::ImmTyNone, true)); return MatchOperand_Success; } @@ -1008,24 +1500,29 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); - if (res != MatchOperand_NoMatch) { - return res; - } - +OperandMatchResultTy +AMDGPUAsmParser::parseReg(OperandVector &Operands) { if (auto R = parseRegister()) { assert(R->isReg()); R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_ParseFail; + return MatchOperand_NoMatch; } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } + + return parseReg(Operands); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { // XXX: During parsing we can't determine if minus sign means // negate-modifier or negative immediate value. // By default we suppose it is modifier. @@ -1055,12 +1552,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { Abs = true; } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } - AMDGPUOperand::Modifiers Mods = {false, false, false}; + AMDGPUOperand::Modifiers Mods; if (Negate) { Mods.Neg = true; } @@ -1088,8 +1590,8 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { return MatchOperand_Success; } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { bool Sext = false; if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { @@ -1102,12 +1604,17 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { Parser.Lex(); } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } - AMDGPUOperand::Modifiers Mods = {false, false, false}; + AMDGPUOperand::Modifiers Mods; if (Sext) { if (getLexer().isNot(AsmToken::RParen)) { Error(Parser.getTok().getLoc(), "expected closing parentheses"); @@ -1116,14 +1623,43 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { Parser.Lex(); Mods.Sext = true; } - + if (Mods.hasIntModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); Op.setModifiers(Mods); } + return MatchOperand_Success; } +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { + return parseRegOrImmWithFPInputMods(Operands, false); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { + return 
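A sketch of the operand forms these parsers are meant to accept (operand names hypothetical; the modifier spellings follow the existing AMDGPU assembler syntax rather than anything new in this change):

// parseRegOrImmWithFPInputMods:  v0   -v0   |v0|   -|v0|   (and the abs(...) spelling)
// parseRegOrImmWithIntInputMods: v0   sext(v0)
// With AllowImm = false, the new parseRegWithFPInputMods / parseRegWithIntInputMods
// wrappers require a register rather than a literal.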
parseRegOrImmWithIntInputMods(Operands, false); +} + +OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { + std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); + if (Reg) { + Operands.push_back(std::move(Reg)); + return MatchOperand_Success; + } + + const AsmToken &Tok = Parser.getTok(); + if (Tok.getString() == "off") { + Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(), + AMDGPUOperand::ImmTyOff, false)); + Parser.Lex(); + return MatchOperand_Success; + } + + return MatchOperand_NoMatch; +} + unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; @@ -1139,65 +1675,137 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { getForcedEncodingSize() != 64) return Match_PreferE32; + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { + // v_mac_f32/16 allow only dst_sel == DWORD; + auto OpNum = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel); + const auto &Op = Inst.getOperand(OpNum); + if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) { + return Match_InvalidOperand; + } + } + return Match_Success; } +// What asm variants we should check +ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { + if (getForcedEncodingSize() == 32) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT}; + return makeArrayRef(Variants); + } + + if (isForcedVOP3()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3}; + return makeArrayRef(Variants); + } + + if (isForcedSDWA()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + return makeArrayRef(Variants); + } + + if (isForcedDPP()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DPP}; + return makeArrayRef(Variants); + } + + static const unsigned Variants[] = { + AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + }; + + return makeArrayRef(Variants); +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { MCInst Inst; + unsigned Result = Match_Success; + for (auto Variant : getMatchedVariants()) { + uint64_t EI; + auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm, + Variant); + // We order match statuses from least to most specific. 
We use most specific + // status as resulting + // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32 + if ((R == Match_Success) || + (R == Match_PreferE32) || + (R == Match_MissingFeature && Result != Match_PreferE32) || + (R == Match_InvalidOperand && Result != Match_MissingFeature + && Result != Match_PreferE32) || + (R == Match_MnemonicFail && Result != Match_InvalidOperand + && Result != Match_MissingFeature + && Result != Match_PreferE32)) { + Result = R; + ErrorInfo = EI; + } + if (R == Match_Success) + break; + } - switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { - default: break; - case Match_Success: - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, getSTI()); - return false; - case Match_MissingFeature: - return Error(IDLoc, "instruction not supported on this GPU"); + switch (Result) { + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, getSTI()); + return false; - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_MissingFeature: + return Error(IDLoc, "instruction not supported on this GPU"); - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0ULL) { - if (ErrorInfo >= Operands.size()) { - return Error(IDLoc, "too few operands for instruction"); - } - ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) { + return Error(IDLoc, "too few operands for instruction"); } - return Error(ErrorLoc, "invalid operand for instruction"); + ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; } - case Match_PreferE32: - return Error(IDLoc, "internal error: instruction without _e64 suffix " - "should be encoded as e32"); + return Error(ErrorLoc, "invalid operand for instruction"); + } + + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } +bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) { + int64_t Tmp = -1; + if (getLexer().isNot(AsmToken::Integer) && getLexer().isNot(AsmToken::Identifier)) { + return true; + } + if (getParser().parseAbsoluteExpression(Tmp)) { + return true; + } + Ret = static_cast<uint32_t>(Tmp); + return false; +} + + bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor) { - if (getLexer().isNot(AsmToken::Integer)) + if (ParseAsAbsoluteExpression(Major)) return TokError("invalid major version"); - Major = getLexer().getTok().getIntVal(); - Lex(); - if (getLexer().isNot(AsmToken::Comma)) return TokError("minor version number required, comma expected"); Lex(); - if (getLexer().isNot(AsmToken::Integer)) + if (ParseAsAbsoluteExpression(Minor)) return TokError("invalid minor version"); - Minor = getLexer().getTok().getIntVal(); - Lex(); - return false; } @@ -1214,7 +1822,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { } bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { - uint32_t Major; uint32_t Minor; uint32_t Stepping; @@ -1231,7 +1838,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return false; } - if (ParseDirectiveMajorMinor(Major, Minor)) return true; @@ -1239,12 
+1845,9 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return TokError("stepping version number required, comma expected"); Lex(); - if (getLexer().isNot(AsmToken::Integer)) + if (ParseAsAbsoluteExpression(Stepping)) return TokError("invalid stepping version"); - Stepping = getLexer().getTok().getIntVal(); - Lex(); - if (getLexer().isNot(AsmToken::Comma)) return TokError("vendor name required, comma expected"); Lex(); @@ -1270,6 +1873,46 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return false; } +bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() { + std::string Metadata; + raw_string_ostream MS(Metadata); + + getLexer().setSkipSpace(false); + + bool FoundEnd = false; + while (!getLexer().is(AsmToken::Eof)) { + while (getLexer().is(AsmToken::Space)) { + MS << ' '; + Lex(); + } + + if (getLexer().is(AsmToken::Identifier)) { + StringRef ID = getLexer().getTok().getIdentifier(); + if (ID == ".end_amdgpu_runtime_metadata") { + Lex(); + FoundEnd = true; + break; + } + } + + MS << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); + + Parser.eatToEndOfStatement(); + } + + getLexer().setSkipSpace(true); + + if (getLexer().is(AsmToken::Eof) && !FoundEnd) + return TokError("expected directive .end_amdgpu_runtime_metadata not found"); + + MS.flush(); + + getTargetStreamer().EmitRuntimeMetadata(Metadata); + + return false; +} + bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { SmallString<40> ErrStr; @@ -1282,12 +1925,10 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, } bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { - amd_kernel_code_t Header; AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { - // Lex EndOfStatement. This is in a while loop, because lexing a comment // will set the current token to EndOfStatement. 
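A hypothetical assembler fragment using these directives; since the major/minor/stepping values now go through ParseAsAbsoluteExpression(), they may be symbols or expressions rather than bare integer tokens, and the runtime metadata is copied verbatim between the new directive pair:

//   .hsa_code_object_version 2, 1
//   .set isa_major, 8
//   .hsa_code_object_isa isa_major, 0, 3, "AMD", "AMDGPU"
//   .amdgpu_runtime_metadata
//       (metadata text, passed through EmitRuntimeMetadata unchanged)
//   .end_amdgpu_runtime_metadata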
while(getLexer().is(AsmToken::EndOfStatement)) @@ -1326,6 +1967,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { getTargetStreamer().EmitAMDGPUSymbolType(KernelName, ELF::STT_AMDGPU_HSA_KERNEL); Lex(); + KernelScope.initialize(getContext()); return false; } @@ -1378,6 +2020,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".hsa_code_object_isa") return ParseDirectiveHSACodeObjectISA(); + if (IDVal == ".amdgpu_runtime_metadata") + return ParseDirectiveRuntimeMetadata(); + if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); @@ -1433,7 +2078,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, return true; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // Try to parse with a custom parser @@ -1464,11 +2109,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { SMLoc S = Tok.getLoc(); const MCExpr *Expr = nullptr; if (!Parser.parseExpression(Expr)) { - Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S)); + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); return MatchOperand_Success; } - Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc())); + Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc())); Parser.Lex(); return MatchOperand_Success; } @@ -1502,10 +2147,10 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, SMLoc NameLoc, OperandVector &Operands) { // Add the instruction mnemonic Name = parseMnemonicSuffix(Name); - Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); + Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); while (!getLexer().is(AsmToken::EndOfStatement)) { - AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + OperandMatchResultTy Res = parseOperand(Operands, Name); // Eat the comma or space if there is one. 
if (getLexer().is(AsmToken::Comma)) @@ -1535,7 +2180,7 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, // Utility functions //===----------------------------------------------------------------------===// -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { switch(getLexer().getKind()) { default: return MatchOperand_NoMatch; @@ -1561,15 +2206,14 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { return MatchOperand_Success; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, enum AMDGPUOperand::ImmTy ImmTy, bool (*ConvertResult)(int64_t&)) { - SMLoc S = Parser.getTok().getLoc(); int64_t Value = 0; - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); + OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); if (Res != MatchOperand_Success) return Res; @@ -1577,11 +2221,11 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, return MatchOperand_ParseFail; } - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy)); return MatchOperand_Success; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, enum AMDGPUOperand::ImmTy ImmTy) { int64_t Bit = 0; @@ -1609,7 +2253,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, } } - Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy)); return MatchOperand_Success; } @@ -1627,7 +2271,7 @@ void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, } } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) { if (getLexer().isNot(AsmToken::Identifier)) { return MatchOperand_NoMatch; @@ -1657,7 +2301,6 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) { void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { @@ -1681,7 +2324,6 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, } void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; bool GDSOnly = false; @@ -1712,6 +2354,46 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } +void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + unsigned EnMask = 0; + int SrcIdx = 0; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + EnMask |= (1 << SrcIdx); + Op.addRegOperands(Inst, 1); + ++SrcIdx; + continue; + } + + if (Op.isOff()) { + ++SrcIdx; + Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister)); + continue; + } + + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyExpTgt) { + Op.addImmOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "done") + continue; + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, 
OptionalIdx, AMDGPUOperand::ImmTyExpVM); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr); + + Inst.addOperand(MCOperand::createImm(EnMask)); +} //===----------------------------------------------------------------------===// // s_waitcnt @@ -1739,52 +2421,41 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) Parser.Lex(); - int CntShift; - int CntMask; - - if (CntName == "vmcnt") { - CntMask = 0xf; - CntShift = 0; - } else if (CntName == "expcnt") { - CntMask = 0x7; - CntShift = 4; - } else if (CntName == "lgkmcnt") { - CntMask = 0xf; - CntShift = 8; - } else { + IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); + if (CntName == "vmcnt") + IntVal = encodeVmcnt(IV, IntVal, CntVal); + else if (CntName == "expcnt") + IntVal = encodeExpcnt(IV, IntVal, CntVal); + else if (CntName == "lgkmcnt") + IntVal = encodeLgkmcnt(IV, IntVal, CntVal); + else return true; - } - IntVal &= ~(CntMask << CntShift); - IntVal |= (CntVal << CntShift); return false; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - // Disable all counters by default. - // vmcnt [3:0] - // expcnt [6:4] - // lgkmcnt [11:8] - int64_t CntVal = 0xf7f; + IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); + int64_t Waitcnt = getWaitcntBitMask(IV); SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { default: return MatchOperand_ParseFail; case AsmToken::Integer: // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(CntVal)) + if (getParser().parseAbsoluteExpression(Waitcnt)) return MatchOperand_ParseFail; break; case AsmToken::Identifier: do { - if (parseCnt(CntVal)) + if (parseCnt(Waitcnt)) return MatchOperand_ParseFail; } while(getLexer().isNot(AsmToken::EndOfStatement)); break; } - Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S)); return MatchOperand_Success; } @@ -1849,7 +2520,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, return false; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; @@ -1889,7 +2560,7 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { } break; } - Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); return MatchOperand_Success; } @@ -1997,7 +2668,147 @@ bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &O return false; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { + if (getLexer().getKind() != AsmToken::Identifier) + return MatchOperand_NoMatch; + + StringRef Str = Parser.getTok().getString(); + int Slot = StringSwitch<int>(Str) + .Case("p10", 0) + .Case("p20", 1) + .Case("p0", 2) + .Default(-1); + + SMLoc S = Parser.getTok().getLoc(); + if (Slot == -1) + return MatchOperand_ParseFail; + + Parser.Lex(); + Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S, + AMDGPUOperand::ImmTyInterpSlot)); + return MatchOperand_Success; +} + +OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { + if (getLexer().getKind() != AsmToken::Identifier) + return MatchOperand_NoMatch; + + StringRef Str = 
Parser.getTok().getString(); + if (!Str.startswith("attr")) + return MatchOperand_NoMatch; + + StringRef Chan = Str.take_back(2); + int AttrChan = StringSwitch<int>(Chan) + .Case(".x", 0) + .Case(".y", 1) + .Case(".z", 2) + .Case(".w", 3) + .Default(-1); + if (AttrChan == -1) + return MatchOperand_ParseFail; + + Str = Str.drop_back(2).drop_front(4); + + uint8_t Attr; + if (Str.getAsInteger(10, Attr)) + return MatchOperand_ParseFail; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); + if (Attr > 63) { + Error(S, "out of bounds attr"); + return MatchOperand_Success; + } + + SMLoc SChan = SMLoc::getFromPointer(Chan.data()); + + Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S, + AMDGPUOperand::ImmTyInterpAttr)); + Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan, + AMDGPUOperand::ImmTyAttrChan)); + return MatchOperand_Success; +} + +void AMDGPUAsmParser::errorExpTgt() { + Error(Parser.getTok().getLoc(), "invalid exp target"); +} + +OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str, + uint8_t &Val) { + if (Str == "null") { + Val = 9; + return MatchOperand_Success; + } + + if (Str.startswith("mrt")) { + Str = Str.drop_front(3); + if (Str == "z") { // == mrtz + Val = 8; + return MatchOperand_Success; + } + + if (Str.getAsInteger(10, Val)) + return MatchOperand_ParseFail; + + if (Val > 7) + errorExpTgt(); + + return MatchOperand_Success; + } + + if (Str.startswith("pos")) { + Str = Str.drop_front(3); + if (Str.getAsInteger(10, Val)) + return MatchOperand_ParseFail; + + if (Val > 3) + errorExpTgt(); + + Val += 12; + return MatchOperand_Success; + } + + if (Str.startswith("param")) { + Str = Str.drop_front(5); + if (Str.getAsInteger(10, Val)) + return MatchOperand_ParseFail; + + if (Val >= 32) + errorExpTgt(); + + Val += 32; + return MatchOperand_Success; + } + + if (Str.startswith("invalid_target_")) { + Str = Str.drop_front(15); + if (Str.getAsInteger(10, Val)) + return MatchOperand_ParseFail; + + errorExpTgt(); + return MatchOperand_Success; + } + + return MatchOperand_NoMatch; +} + +OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { + uint8_t Val; + StringRef Str = Parser.getTok().getString(); + + auto Res = parseExpTgtImpl(Str, Val); + if (Res != MatchOperand_Success) + return Res; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); + + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, + AMDGPUOperand::ImmTyExpTgt)); + return MatchOperand_Success; +} + +OperandMatchResultTy AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { using namespace llvm::AMDGPU::SendMsg; @@ -2068,11 +2879,11 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { } Imm16Val |= (StreamId << STREAM_ID_SHIFT_); } - } while (0); + } while (false); } break; } - Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); return MatchOperand_Success; } @@ -2084,7 +2895,7 @@ bool AMDGPUOperand::isSendMsg() const { // sopp branch targets //===----------------------------------------------------------------------===// -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); @@ -2094,12 +2905,12 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { int64_t Imm; if (getParser().parseAbsoluteExpression(Imm)) return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + 
Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S)); return MatchOperand_Success; } case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr( + Operands.push_back(AMDGPUOperand::CreateExpr(this, MCSymbolRefExpr::create(getContext().getOrCreateSymbol( Parser.getTok().getString()), getContext()), S)); Parser.Lex(); @@ -2112,15 +2923,15 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { //===----------------------------------------------------------------------===// AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE); } void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, @@ -2192,7 +3003,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { - assert(false); + llvm_unreachable("unexpected operand type"); } } @@ -2228,7 +3039,7 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { - assert(false); + llvm_unreachable("unexpected operand type"); } } @@ -2243,48 +3054,53 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE); } //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// -bool AMDGPUOperand::isSMRDOffset() const { - - // FIXME: Support 20-bit offsets on VI. We need to to pass subtarget - // information here. +bool AMDGPUOperand::isSMRDOffset8() const { return isImm() && isUInt<8>(getImm()); } +bool AMDGPUOperand::isSMRDOffset20() const { + return isImm() && isUInt<20>(getImm()); +} + bool AMDGPUOperand::isSMRDLiteralOffset() const { // 32-bit literals are only supported on CI and we only want to use them // when the offset is > 8-bits. 
return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } //===----------------------------------------------------------------------===// @@ -2317,10 +3133,13 @@ static bool ConvertBoundCtrl(int64_t &BoundCtrl) { if (BoundCtrl == 0) { BoundCtrl = 1; return true; - } else if (BoundCtrl == -1) { + } + + if (BoundCtrl == -1) { BoundCtrl = 0; return true; } + return false; } @@ -2350,9 +3169,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, + {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr}, }; -AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { +OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { OperandMatchResultTy res; for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { // try to parse any optional operand here @@ -2376,16 +3196,19 @@ AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(Oper return MatchOperand_NoMatch; } -AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) -{ +OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) { StringRef Name = Parser.getTok().getString(); if (Name == "mul") { - return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul); - } else if (Name == "div") { - return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv); - } else { - return MatchOperand_NoMatch; + return parseIntWithPrefix("mul", Operands, + AMDGPUOperand::ImmTyOModSI, ConvertOmodMul); + } + + if (Name == "div") { + return parseIntWithPrefix("div", Operands, + AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv); } + + return MatchOperand_NoMatch; } void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) { @@ -2407,6 +3230,17 @@ void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) } } +static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { + // 1. This operand is input modifiers + return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS + // 2. This is not last operand + && Desc.NumOperands > (OpNum + 1) + // 3. Next operand is register class + && Desc.OpInfo[OpNum + 1].RegClass != -1 + // 4. 
Next register is not tied to any other operand + && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; +} + void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; unsigned I = 1; @@ -2417,18 +3251,36 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (Op.isRegOrImmWithInputMods()) { - // only fp modifiers allowed in VOP3 + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); } else if (Op.isImm()) { OptionalIdx[Op.getImmTy()] = I; } else { - assert(false); + llvm_unreachable("unhandled operand type"); } } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + + // special case v_mac_{f16, f32}: + // it has src2 register operand that is tied to dst operand + // we don't allow modifiers for this operand in assembler so src2_modifiers + // should be 0 + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || + Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) { + auto it = Inst.begin(); + std::advance( + it, + AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ? + AMDGPU::V_MAC_F16_e64 : + AMDGPU::V_MAC_F32_e64, + AMDGPU::OpName::src2_modifiers)); + it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 + ++it; + Inst.insert(it, Inst.getOperand(0)); // src2 = dst + } } //===----------------------------------------------------------------------===// @@ -2455,7 +3307,11 @@ bool AMDGPUOperand::isDPPCtrl() const { return false; } -AMDGPUAsmParser::OperandMatchResultTy +bool AMDGPUOperand::isGPRIdxMode() const { + return isImm() && isUInt<4>(getImm()); +} + +OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); StringRef Prefix; @@ -2469,8 +3325,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { if (Prefix == "row_mirror") { Int = 0x140; + Parser.Lex(); } else if (Prefix == "row_half_mirror") { Int = 0x141; + Parser.Lex(); } else { // Check to prevent parseDPPCtrlOps from eating invalid tokens if (Prefix != "quad_perm" @@ -2494,60 +3352,46 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { Parser.Lex(); if (getLexer().isNot(AsmToken::LBrac)) return MatchOperand_ParseFail; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - Int = getLexer().getTok().getIntVal(); - Parser.Lex(); - if (getLexer().isNot(AsmToken::Comma)) - return MatchOperand_ParseFail; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) + if (getParser().parseAbsoluteExpression(Int) || !(0 <= Int && Int <=3)) return MatchOperand_ParseFail; - Int += (getLexer().getTok().getIntVal() << 2); - Parser.Lex(); - if (getLexer().isNot(AsmToken::Comma)) - return MatchOperand_ParseFail; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - Int += (getLexer().getTok().getIntVal() << 4); + for (int i = 0; i < 3; ++i) { + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); - Parser.Lex(); - if (getLexer().isNot(AsmToken::Comma)) - return MatchOperand_ParseFail; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - Int += (getLexer().getTok().getIntVal() 
<< 6); + int64_t Temp; + if (getParser().parseAbsoluteExpression(Temp) || !(0 <= Temp && Temp <=3)) + return MatchOperand_ParseFail; + const int shift = i*2 + 2; + Int += (Temp << shift); + } - Parser.Lex(); if (getLexer().isNot(AsmToken::RBrac)) return MatchOperand_ParseFail; + Parser.Lex(); } else { // sel:%d Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) + if (getParser().parseAbsoluteExpression(Int)) return MatchOperand_ParseFail; - Int = getLexer().getTok().getIntVal(); - if (Prefix == "row_shl") { + if (Prefix == "row_shl" && 1 <= Int && Int <= 15) { Int |= 0x100; - } else if (Prefix == "row_shr") { + } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) { Int |= 0x110; - } else if (Prefix == "row_ror") { + } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) { Int |= 0x120; - } else if (Prefix == "wave_shl") { + } else if (Prefix == "wave_shl" && 1 == Int) { Int = 0x130; - } else if (Prefix == "wave_rol") { + } else if (Prefix == "wave_rol" && 1 == Int) { Int = 0x134; - } else if (Prefix == "wave_shr") { + } else if (Prefix == "wave_shr" && 1 == Int) { Int = 0x138; - } else if (Prefix == "wave_ror") { + } else if (Prefix == "wave_ror" && 1 == Int) { Int = 0x13C; } else if (Prefix == "row_bcast") { if (Int == 15) { @@ -2562,23 +3406,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } } } - Parser.Lex(); // eat last token - Operands.push_back(AMDGPUOperand::CreateImm(Int, S, - AMDGPUOperand::ImmTyDppCtrl)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTyDppCtrl)); return MatchOperand_Success; } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const { - return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); + return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { - return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); + return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const { - return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); } void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { @@ -2593,9 +3435,12 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isRegOrImmWithInputMods()) { - // Only float modifiers supported in DPP - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token. + // Skip it. 
+ continue; + } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithFPInputModsOperands(Inst, 2); } else if (Op.isDPPCtrl()) { Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { @@ -2609,18 +3454,30 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + + // special case v_mac_{f16, f32}: + // it has src2 register operand that is tied to dst operand + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp || + Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) { + auto it = Inst.begin(); + std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + Inst.insert(it, Inst.getOperand(0)); // src2 = dst + } } //===----------------------------------------------------------------------===// // sdwa //===----------------------------------------------------------------------===// -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type) { + using namespace llvm::AMDGPU::SDWA; + SMLoc S = Parser.getTok().getLoc(); StringRef Value; - AMDGPUAsmParser::OperandMatchResultTy res; + OperandMatchResultTy res; res = parseStringWithPrefix(Prefix, Value); if (res != MatchOperand_Success) { @@ -2629,13 +3486,13 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, int64_t Int; Int = StringSwitch<int64_t>(Value) - .Case("BYTE_0", 0) - .Case("BYTE_1", 1) - .Case("BYTE_2", 2) - .Case("BYTE_3", 3) - .Case("WORD_0", 4) - .Case("WORD_1", 5) - .Case("DWORD", 6) + .Case("BYTE_0", SdwaSel::BYTE_0) + .Case("BYTE_1", SdwaSel::BYTE_1) + .Case("BYTE_2", SdwaSel::BYTE_2) + .Case("BYTE_3", SdwaSel::BYTE_3) + .Case("WORD_0", SdwaSel::WORD_0) + .Case("WORD_1", SdwaSel::WORD_1) + .Case("DWORD", SdwaSel::DWORD) .Default(0xffffffff); Parser.Lex(); // eat last token @@ -2643,15 +3500,17 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, return MatchOperand_ParseFail; } - Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type)); return MatchOperand_Success; } -AMDGPUAsmParser::OperandMatchResultTy +OperandMatchResultTy AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { + using namespace llvm::AMDGPU::SDWA; + SMLoc S = Parser.getTok().getLoc(); StringRef Value; - AMDGPUAsmParser::OperandMatchResultTy res; + OperandMatchResultTy res; res = parseStringWithPrefix("dst_unused", Value); if (res != MatchOperand_Success) { @@ -2660,9 +3519,9 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { int64_t Int; Int = StringSwitch<int64_t>(Value) - .Case("UNUSED_PAD", 0) - .Case("UNUSED_SEXT", 1) - .Case("UNUSED_PRESERVE", 2) + .Case("UNUSED_PAD", DstUnused::UNUSED_PAD) + .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT) + .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE) .Default(0xffffffff); Parser.Lex(); // eat last token @@ -2670,8 +3529,7 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { return MatchOperand_ParseFail; } - Operands.push_back(AMDGPUOperand::CreateImm(Int, S, - AMDGPUOperand::ImmTySdwaDstUnused)); + Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySdwaDstUnused)); return MatchOperand_Success; } @@ -2700,13 +3558,15 @@ 
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (BasicInstType == SIInstrFlags::VOPC && + if ((BasicInstType == SIInstrFlags::VOPC || + BasicInstType == SIInstrFlags::VOP2)&& Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC sdwa use "vcc" token as dst. Skip it. + // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it. continue; - } else if (Op.isRegOrImmWithInputMods()) { - Op.addRegOrImmWithInputModsOperands(Inst, 2); + } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; @@ -2716,46 +3576,55 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) { - // V_NOP_sdwa has no optional sdwa arguments - return; - } - switch (BasicInstType) { - case SIInstrFlags::VOP1: { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - break; - } - case SIInstrFlags::VOP2: { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); - break; - } - case SIInstrFlags::VOPC: { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); - break; + + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + // V_NOP_sdwa_vi has no optional sdwa arguments + switch (BasicInstType) { + case SIInstrFlags::VOP1: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + break; + + case SIInstrFlags::VOP2: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + break; + + case SIInstrFlags::VOPC: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + break; + + default: + llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed"); + } } - default: - llvm_unreachable("Invalid instruction type. 
Only VOP1, VOP2 and VOPC allowed"); + + // special case v_mac_{f16, f32}: + // it has src2 register operand that is tied to dst operand + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { + auto it = Inst.begin(); + std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + Inst.insert(it, Inst.getOperand(0)); // src2 = dst } + } /// Force static initialization. extern "C" void LLVMInitializeAMDGPUAsmParser() { - RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget); - RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget); + RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget()); + RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget()); } #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #include "AMDGPUGenAsmMatcher.inc" - // This fuction should be defined after auto-generated include so that we have // MatchClassKind enum defined unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, @@ -2776,16 +3645,27 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isIdxen() ? Match_Success : Match_InvalidOperand; case MCK_offen: return Operand.isOffen() ? Match_Success : Match_InvalidOperand; - case MCK_SSrc32: + case MCK_SSrcB32: // When operands have expression values, they will return true for isToken, // because it is not possible to distinguish between a token and an // expression at parse time. MatchInstructionImpl() will always try to // match an operand as a token, when isToken returns true, and when the // name of the expression is not a valid token, the match will fail, // so we need to handle it here. - return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand; + return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand; + case MCK_SSrcF32: + return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand; case MCK_SoppBrTarget: return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand; - default: return Match_InvalidOperand; + case MCK_VReg32OrOff: + return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand; + case MCK_InterpSlot: + return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand; + case MCK_Attr: + return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; + case MCK_AttrChan: + return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; + default: + return Match_InvalidOperand; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td new file mode 100644 index 0000000..45a7fe6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -0,0 +1,1350 @@ +//===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; +def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; + +def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; +def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">; +def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">; + +class MubufLoad <SDPatternOperator op> : PatFrag < + (ops node:$ptr), (op node:$ptr), [{ + auto const AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +}]>; + +def mubuf_load : MubufLoad <load>; +def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>; +def mubuf_sextloadi8 : MubufLoad <sextloadi8>; +def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>; +def mubuf_sextloadi16 : MubufLoad <sextloadi16>; +def mubuf_load_atomic : MubufLoad <atomic_load>; + +def BUFAddrKind { + int Offset = 0; + int OffEn = 1; + int IdxEn = 2; + int BothEn = 3; + int Addr64 = 4; +} + +class getAddrName<int addrKind> { + string ret = + !if(!eq(addrKind, BUFAddrKind.Offset), "offset", + !if(!eq(addrKind, BUFAddrKind.OffEn), "offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), "idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), "bothen", + !if(!eq(addrKind, BUFAddrKind.Addr64), "addr64", + ""))))); +} + +class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo <string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 8; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + let Uses = [EXEC]; + + let hasSideEffects = 0; + let SchedRW = [WriteVMEM]; +} + +class MTBUF_Real <MTBUF_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + Enc64 { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; + + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < + opName, (outs 
regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), + " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# + " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { + let mayLoad = 1; + let mayStore = 0; +} + +class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < + opName, (outs), + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), + " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# + " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { + let mayLoad = 0; + let mayStore = 1; +} + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class MUBUF_Pseudo <string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 8; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + let Uses = [EXEC]; + let hasSideEffects = 0; + let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMubuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; +} + +class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; + + bits<12> offset; + bits<1> glc; + bits<1> lds = 0; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + + +// For cache invalidation instructions. +class MUBUF_Invalidate <string opName, SDPatternOperator node> : + MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> { + + let AsmMatchConverter = ""; + + let hasSideEffects = 1; + let mayStore = 1; + + // Set everything to 0. 
+ let offen = 0; + let idxen = 0; + let addr64 = 0; + let has_vdata = 0; + let has_vaddr = 0; + let has_glc = 0; + let glc_value = 0; + let has_srsrc = 0; + let has_soffset = 0; + let has_offset = 0; + let has_slc = 0; + let has_tfe = 0; +} + +class getMUBUFInsDA<list<RegisterClass> vdataList, + list<RegisterClass> vaddrList=[]> { + RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); +} + +class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret, + (ins)))))); +} + +class getMUBUFAsmOps<int addrKind> { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), "$vaddr, $srsrc, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), "$vaddr, $srsrc, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), "$vaddr, $srsrc, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MUBUF_SetupAddr<int addrKind> { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MUBUF_Load_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MUBUF_Pseudo<opName, + (outs vdataClass:$vdata), + getMUBUFIns<addrKindCopy>.ret, + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MUBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let mayLoad = 1; + let mayStore = 0; +} + +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. 
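// Illustrative sketch only (not part of the patch): how the helper classes
// above compose, using the BUFFER_LOAD_DWORD defm that appears further down
// in this file as the example. Each MUBUF_Pseudo_Loads instantiation fans out
// into one pseudo per BUFAddrKind (suffixes _OFFSET, _OFFEN, _IDXEN, _BOTHEN,
// _ADDR64), each given the getMUBUFIns<addrKind> operand list and the
// getMUBUFAsmOps<addrKind> assembly string:
//
//   defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads<"buffer_load_dword", VGPR_32, i32, mubuf_load>;
//
//   BUFFER_LOAD_DWORD_OFFSET   " $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe"
//   BUFFER_LOAD_DWORD_OFFEN    " $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe"
//   BUFFER_LOAD_DWORD_IDXEN    " $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe"
//   BUFFER_LOAD_DWORD_BOTHEN   " $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe"
//   BUFFER_LOAD_DWORD_ADDR64   " $vdata, $vaddr, $srsrc, $soffset addr64$offset$glc$slc$tfe"
//
// plus _exact copies of the non-ADDR64 forms under DisableWQM = 1; only the
// _OFFSET and _ADDR64 defs carry selection patterns.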
+multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, + MUBUFAddr64Table<0>; + + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, + MUBUFAddr64Table<1>; + + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + +class MUBUF_Store_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MUBUF_Pseudo<opName, + (outs), + getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MUBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let mayLoad = 0; + let mayStore = 1; +} + +multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + ValueType store_vt = i32, + SDPatternOperator st = null_frag> { + + def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, + MUBUFAddr64Table<0>; + + def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, + MUBUFAddr64Table<1>; + + def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + + +class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, + list<RegisterClass> vaddrList=[]> { + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag ret = !if(vdata_in, + !if(!empty(vaddrList), + (ins vdataClass:$vdata_in, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + (ins vdataClass:$vdata_in, vaddrClass:$vaddr, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + ), + !if(!empty(vaddrList), + (ins vdataClass:$vdata, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + (ins vdataClass:$vdata, vaddrClass:$vaddr, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + )); +} + +class 
getMUBUFAtomicIns<int addrKind, + RegisterClass vdataClass, + bit vdata_in, + // Workaround bug bz30254 + RegisterClass vdataClassCopy=vdataClass> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret, + (ins)))))); +} + +class MUBUF_Atomic_Pseudo<string opName, + int addrKind, + dag outs, + dag ins, + string asmOps, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>, + MUBUF_SetupAddr<addrKindCopy> { + let mayStore = 1; + let mayLoad = 1; + let hasPostISelHook = 1; + let hasSideEffects = 1; + let DisableWQM = 1; + let has_glc = 0; + let has_tfe = 0; +} + +class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MUBUF_Atomic_Pseudo<opName, addrKindCopy, + (outs), + getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret, + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc", + pattern>, + AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let glc_value = 0; + let AsmMatchConverter = "cvtMubufAtomic"; +} + +class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MUBUF_Atomic_Pseudo<opName, addrKindCopy, + (outs vdataClassCopy:$vdata), + getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret, + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # " glc$slc", + pattern>, + AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> { + let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret; + let glc_value = 1; + let Constraints = "$vdata = $vdata_in"; + let DisableEncoding = "$vdata_in"; + let AsmMatchConverter = "cvtMubufAtomicReturn"; +} + +multiclass MUBUF_Pseudo_Atomics <string opName, + RegisterClass vdataClass, + ValueType vdataType, + SDPatternOperator atomic> { + + def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, + MUBUFAddr64Table <0>; + def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, + MUBUFAddr64Table <1>; + def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set vdataType:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <0, "_RTN">; + + def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set vdataType:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <1, "_RTN">; + + def 
_RTN_OFFEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _RTN_IDXEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; +} + + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGCN in { + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads < + "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads < + "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads < + "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads < + "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores < + "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores < + "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores < + "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < + "buffer_store_format_xyzw", VReg_128 +>; +defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads < + "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads < + "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 +>; +defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads < + "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads < + "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 +>; +defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads < + "buffer_load_dword", VGPR_32, i32, mubuf_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < + "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load +>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < + "buffer_load_dwordx3", VReg_96, untyped, mubuf_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < + "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load +>; +defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < + "buffer_store_byte", VGPR_32, i32, truncstorei8_global +>; +defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < + "buffer_store_short", VGPR_32, i32, truncstorei16_global +>; +defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < + "buffer_store_dword", VGPR_32, i32, global_store +>; +defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < + "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; +defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < + "buffer_store_dwordx3", VReg_96, untyped, global_store +>; +defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < + "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; +defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < + "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < + "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag +>; +defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < + "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < + "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < + "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < + "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < + "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < + 
"buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < + "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < + "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < + "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < + "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global +>; +defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < + "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global +>; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global +>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag +>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global +>; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global +>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global +>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global +>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global +>; + +let SubtargetPredicate = isSI in { // isn't on CI & VI +/* +defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">; +defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">; +defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">; +defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">; +defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">; +*/ + +def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", + int_amdgcn_buffer_wbinvl1_sc>; +} + +def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", + int_amdgcn_buffer_wbinvl1>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; +def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; 
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; +def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; +def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; + +} // End let SubtargetPredicate = isGCN + +let SubtargetPredicate = isCIVI in { + +//===----------------------------------------------------------------------===// +// Instruction definitions for CI and newer. +//===----------------------------------------------------------------------===// +// Remaining instructions: +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 + +def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", + int_amdgcn_buffer_wbinvl1_vol>; + +} // End let SubtargetPredicate = isCIVI + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isGCN] in { + +// int_SI_vs_load_input +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) +>; + +// Offset in an 32-bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0) +>; + + +//===----------------------------------------------------------------------===// +// buffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; + +multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm 
$offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">; + +//===----------------------------------------------------------------------===// +// buffer_atomic patterns +//===----------------------------------------------------------------------===// + +multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) + >; +} + +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">; +defm : 
BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + + +class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, + PatFrag constant_ld> : Pat < + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; + +multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, + ValueType vt, PatFrag atomic_ld> { + def : Pat < + (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + >; +} + +let Predicates = [isSICI] in { +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; + +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; +} // End Predicates = [isSICI] + +multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, + PatFrag ld> { + + def : Pat < + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [Has16BitInsts] in { + +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, 
mubuf_sextloadi8>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; + +} // End Predicates = [Has16BitInsts] + +class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; + +// BUFFER_LOAD_DWORD*, addr64=0 +multiclass MUBUF_Load_Dword <ValueType vt, + MUBUF_Pseudo offset, + MUBUF_Pseudo offen, + MUBUF_Pseudo idxen, + MUBUF_Pseudo bothen> { + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, + imm:$offset, 0, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 1, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 0, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, + imm:$offset, 1, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; +} + +defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; +defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, + BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; +defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, + BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; + +multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, + ValueType vt, PatFrag atomic_st> { + // Store follows atomic op convention so address is first + def : Pat < + (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$val), + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + >; +} +let Predicates = [isSICI] in { +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>; +} // End Predicates = [isSICI] + + 
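The MUBUF load, store and atomic patterns above all share one shape: each node is matched once per addressing variant, and the variant (_OFFSET, _OFFEN, _IDXEN or _BOTHEN) is chosen by whether the address carries a per-lane vector index and/or a per-lane VGPR byte offset. A minimal C++ sketch of that selection, for illustration only; the enum and function names below are hypothetical and not part of the AMDGPU backend, where the choice is made by the TableGen patterns themselves:

#include <cstdio>

// Illustrative model of how the patterns above pick a MUBUF addressing form.
enum class MUBUFVariant { Offset, OffEn, IdxEn, BothEn };

// hasVIndex:  the address carries a per-lane vector index (vindex).
// hasVOffset: the address carries a per-lane VGPR byte offset (voffset).
static MUBUFVariant selectMUBUFVariant(bool hasVIndex, bool hasVOffset) {
  if (hasVIndex && hasVOffset)
    return MUBUFVariant::BothEn; // vaddr = {vindex, voffset} pair (REG_SEQUENCE above)
  if (hasVIndex)
    return MUBUFVariant::IdxEn;  // vaddr = vindex only
  if (hasVOffset)
    return MUBUFVariant::OffEn;  // vaddr = voffset only
  return MUBUFVariant::Offset;   // no vaddr: SGPR soffset plus immediate offset
}

int main() {
  // A buffer access with both an index and a VGPR offset maps to the BOTHEN form.
  std::printf("%d\n", static_cast<int>(selectMUBUFVariant(true, true)));
}
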
+multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, + PatFrag st> { + + def : Pat < + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; +defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; + +class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; + +//===----------------------------------------------------------------------===// +// MTBUF Patterns +//===----------------------------------------------------------------------===// + +// TBUFFER_STORE_FORMAT_*, addr64=0 +class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat< + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + i32:$soffset, imm:$inst_offset, imm:$dfmt, + imm:$nfmt, imm:$offen, imm:$idxen, + imm:$glc, imm:$slc, imm:$tfe), + (opcode + $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), + (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, + (as_i1imm $slc), (as_i1imm $tfe), $soffset) +>; + +def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; +def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; +def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; +def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + +} // End let Predicates = [isGCN] + +//===----------------------------------------------------------------------===// +// Target instructions, move to the appropriate target TD file +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real<op, ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { + let AssemblerPredicate=isSICI; + let DecoderNamespace="SICI"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; + let Inst{16} = lds; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MUBUF_Real_AllAddr_si<bits<7> op> { + def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_si : 
MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; +} + +multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { + def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>; + def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>; + def _RTN_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>; + def _RTN_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>; + def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>; +} + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>; +//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>; +defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>; + +//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI +//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI +//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>; +//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>; +defm 
BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>; +// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI. +//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI +//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI +//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI + +def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>; +def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; + +class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : + MTBUF_Real<ps>, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { + let AssemblerPredicate=isSICI; + let DecoderNamespace="SICI"; + + bits<1> addr64; + let Inst{15} = addr64; + let Inst{18-16} = op; +} + +def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; +def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; +def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; +def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; +def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; + + +//===----------------------------------------------------------------------===// +// CI +//===----------------------------------------------------------------------===// + +class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real_si<op, ps> { + let AssemblerPredicate=isCIOnly; + let DecoderNamespace="CI"; +} + +def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>; + + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real<op, ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { + let AssemblerPredicate=isVI; + let DecoderNamespace="VI"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{16} = lds; + let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { + def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; +} + +multiclass MUBUF_Real_Atomic_vi<bits<7> op> : + MUBUF_Real_AllAddr_vi<op> { + def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>; + def _RTN_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>; + def _RTN_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>; + def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, 
!cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>; +} + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_vi <0x1c>; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_vi <0x1d>; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_vi <0x1e>; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_vi <0x1f>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_vi <0x43>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_vi <0x44>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_vi <0x45>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_vi <0x46>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_vi <0x47>; +defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_vi <0x48>; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_vi <0x49>; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_vi <0x4a>; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_vi <0x4b>; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_vi <0x4c>; + +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_vi <0x60>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_vi <0x61>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_vi <0x62>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_vi <0x63>; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_vi <0x64>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_vi <0x65>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_vi <0x66>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_vi <0x67>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_vi <0x68>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_vi <0x69>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>; + +def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; +def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; + +class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : + MTBUF_Real<ps>, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { + let AssemblerPredicate=isVI; + let DecoderNamespace="VI"; + + let Inst{18-15} = op; +} + +def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; +def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; +def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; +def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; +def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, 
TBUFFER_STORE_FORMAT_XYZW>; + diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td index f9a9f79..26a483a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td @@ -12,338 +12,4 @@ // S_CBRANCH_CDBGUSER // S_CBRANCH_CDBGSYS // S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let SubtargetPredicate = isCIVI in { - -let SchedRW = [WriteDoubleAdd] in { -defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", - VOP_F64_F64, ftrunc ->; -defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", - VOP_F64_F64, fceil ->; -defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", - VOP_F64_F64, ffloor ->; -defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", - VOP_F64_F64, frint ->; -} // End SchedRW = [WriteDoubleAdd] - -let SchedRW = [WriteQuarterRate32] in { -defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", - VOP_F32_F32 ->; -defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", - VOP_F32_F32 ->; -} // End SchedRW = [WriteQuarterRate32] - -//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? 
-defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - - -//===----------------------------------------------------------------------===// -// DS Instructions -//===----------------------------------------------------------------------===// -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; - -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 - -//===----------------------------------------------------------------------===// -// SMRD Instructions -//===----------------------------------------------------------------------===// - -defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, - "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; - -//===----------------------------------------------------------------------===// -// MUBUF Instructions -//===----------------------------------------------------------------------===// - -let DisableSIDecoder = 1 in { -defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, - "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol ->; -} - -//===----------------------------------------------------------------------===// -// Flat Instructions -//===----------------------------------------------------------------------===// - -defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < - flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 ->; -defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < - flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 ->; -defm FLAT_LOAD_USHORT : FLAT_Load_Helper < - flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 ->; -defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < - flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> -; -defm FLAT_LOAD_DWORD : FLAT_Load_Helper < - flat<0xc, 0x14>, "flat_load_dword", VGPR_32 ->; -defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < - flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 ->; -defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < - flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 ->; -defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < - flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 ->; -defm FLAT_STORE_BYTE : FLAT_Store_Helper < - flat<0x18>, "flat_store_byte", VGPR_32 ->; -defm FLAT_STORE_SHORT : FLAT_Store_Helper < - flat <0x1a>, "flat_store_short", VGPR_32 ->; -defm FLAT_STORE_DWORD : FLAT_Store_Helper < - flat<0x1c>, "flat_store_dword", VGPR_32 ->; -defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - flat<0x1d>, "flat_store_dwordx2", VReg_64 ->; -defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 ->; -defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 ->; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < - flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat ->; -defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32, - atomic_cmp_swap_flat, v2i32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < - flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat ->; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < - flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat ->; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < - flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat ->; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < - flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat ->; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC < - flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat ->; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < - flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat ->; -defm 
FLAT_ATOMIC_AND : FLAT_ATOMIC < - flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat ->; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC < - flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat ->; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < - flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat ->; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC < - flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat ->; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < - flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat ->; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < - flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64, i64, atomic_swap_flat ->; -defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64, - atomic_cmp_swap_flat, v2i64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < - flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat ->; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < - flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat ->; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < - flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat ->; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < - flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat ->; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < - flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat ->; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < - flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat ->; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < - flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat ->; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < - flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat ->; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < - flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat ->; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < - flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat ->; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < - flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat ->; - -} // End SubtargetPredicate = isCIVI - -// CI Only flat instructions - -let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in { - -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32, - null_frag, v2f32, VReg_64 ->; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < - flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32 ->; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < - flat<0x40>, "flat_atomic_fmax", VGPR_32, f32 ->; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64, - null_frag, v2f64, VReg_128 ->; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < - flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64 ->; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < - flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64 ->; - -} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 - -//===----------------------------------------------------------------------===// -// Flat Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isCIVI] in { - -// Patterns for global loads with no offset. 
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) ->; - -class FlatLoadAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 1, 0, 0) ->; - -def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; - -def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>; -def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>; - - -class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $addr, $data, 0, 0, 0) ->; - -class FlatStoreAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - // atomic store follows atomic binop convention so the address comes - // first. - (node i64:$addr, vt:$data), - (inst $addr, $data, 1, 0, 0) ->; - -def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; - -def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>; - -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt, - ValueType data_vt = vt> : Pat < - (vt (node i64:$addr, data_vt:$data)), - (inst $addr, $data, 0, 0) ->; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; -def : 
FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; - -} // End Predicates = [isCIVI] +// S_CBRANCH_CDBGSYS_AND_USER
\ No newline at end of file diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index 98bc6e8..6b8e85a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -37,6 +37,9 @@ def MULLO_INT_cm : MULLO_INT_Common<0x8F>; def MULHI_INT_cm : MULHI_INT_Common<0x90>; def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def MULHI_INT_cm24 : MULHI_INT24_Common<0x5c>; +def MULHI_UINT_cm24 : MULHI_UINT24_Common<0xb2>; + def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; @@ -85,14 +88,13 @@ def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { let eop = 0; // This bit is not used on Cayman. } -class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> - : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { +class VTX_READ_cm <string name, dag outs> + : VTX_WORD0_cm, VTX_READ<name, outs, []> { // Static fields let VC_INST = 0; let FETCH_TYPE = 2; let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; let SRC_REL = 0; // XXX: We can infer this field based on the SRC_GPR. This would allow us // to store vertex addresses in any channel, not just X. @@ -105,9 +107,9 @@ class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> let Inst{31-0} = Word0; } -class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_8_cm + : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let DST_SEL_X = 0; let DST_SEL_Y = 7; // Masked @@ -116,9 +118,9 @@ class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern> let DATA_FORMAT = 1; // FMT_8 } -class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_16_cm + : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let DST_SEL_X = 0; let DST_SEL_Y = 7; // Masked let DST_SEL_Z = 7; // Masked @@ -127,9 +129,9 @@ class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern> } -class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_32_cm + : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let DST_SEL_X = 0; let DST_SEL_Y = 7; // Masked @@ -147,9 +149,9 @@ class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern> let Constraints = "$src_gpr.ptr = $dst_gpr"; } -class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { +def VTX_READ_64_cm + : VTX_READ_cm <"VTX_READ_64 $dst_gpr.XY, $src_gpr", + (outs R600_Reg64:$dst_gpr)> { let DST_SEL_X = 0; let DST_SEL_Y = 1; @@ -158,9 +160,9 @@ class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern> let DATA_FORMAT = 0x1D; // COLOR_32_32 } -class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { +def VTX_READ_128_cm + : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", + (outs R600_Reg128:$dst_gpr)> { let DST_SEL_X = 0; let DST_SEL_Y = 1; @@ -177,79 +179,44 @@ class 
VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern> //===----------------------------------------------------------------------===// // VTX Read from parameter memory space //===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; +def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_cm MEMxi:$src_gpr, 3)>; +def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_cm MEMxi:$src_gpr, 3)>; +def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_cm MEMxi:$src_gpr, 3)>; +def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_cm MEMxi:$src_gpr, 3)>; +def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_cm MEMxi:$src_gpr, 3)>; -def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; +//===----------------------------------------------------------------------===// +// VTX Read from constant memory space +//===----------------------------------------------------------------------===// +def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_cm MEMxi:$src_gpr, 2)>; +def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_cm MEMxi:$src_gpr, 2)>; +def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_cm MEMxi:$src_gpr, 2)>; +def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_cm MEMxi:$src_gpr, 2)>; +def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_cm MEMxi:$src_gpr, 2)>; //===----------------------------------------------------------------------===// // VTX Read from global memory space //===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] ->; - -// 16-bit reads -def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 8-bit reads -def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2, - [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] ->; - -// 16-bit reads -def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2, - [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2, - [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2, - [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_ID2_128_cm : 
VTX_READ_128_cm <2, - [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; +def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_cm MEMxi:$src_gpr, 1)>; +def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_cm MEMxi:$src_gpr, 1)>; +def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_cm MEMxi:$src_gpr, 1)>; +def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_cm MEMxi:$src_gpr, 1)>; +def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_cm MEMxi:$src_gpr, 1)>; } // End isCayman diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td new file mode 100644 index 0000000..a077001 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -0,0 +1,906 @@ +//===-- DSInstructions.td - DS Instruction Definitions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : + InstSI <outs, ins, "", pattern>, + SIMCInstr <opName, SIEncodingFamily.NONE> { + + let SubtargetPredicate = isGCN; + + let LGKM_CNT = 1; + let DS = 1; + let Size = 8; + let UseNamedOperandTable = 1; + let Uses = [M0, EXEC]; + + // Most instructions load and store data, so set this as the default. + let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let SchedRW = [WriteLDS]; + + let isPseudo = 1; + let isCodeGenOnly = 1; + + let AsmMatchConverter = "cvtDS"; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + // These bits are kind of a hack because it would be more natural + // to test the "outs" and "ins" dags for the presence of particular operands + bits<1> has_vdst = 1; + bits<1> has_addr = 1; + bits<1> has_data0 = 1; + bits<1> has_data1 = 1; + + bits<1> has_offset = 1; // has "offset" that should be split to offset0,1 + bits<1> has_offset0 = 1; + bits<1> has_offset1 = 1; + + bits<1> has_gds = 1; + bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value +} + +class DS_Real <DS_Pseudo ds> : + InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # " " # ds.AsmOperands, []>, + Enc64 { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ds.SubtargetPredicate; + let AsmMatchConverter = ds.AsmMatchConverter; + + // encoding fields + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + bits<16> offset; + let offset0 = !if(ds.has_offset, offset{7-0}, ?); + let offset1 = !if(ds.has_offset, offset{15-8}, ?); +} + + +// DS Pseudo instructions + +class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), + "$addr, $data0$offset$gds">, + AtomicNoRet<opName, 0> { + + let has_data1 = 0; + let has_vdst = 0; +} + +class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), + "$addr $offset0$offset1$gds"> { + + let has_data0 = 0; + let has_data1 = 0; + let has_vdst = 0; + let has_offset = 0; + let AsmMatchConverter = "cvtDSOffset01"; +} + +class DS_1A2D_NORET<string opName, RegisterClass rc = 
VGPR_32> +: DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds), + "$addr, $data0, $data1"#"$offset"#"$gds">, + AtomicNoRet<opName, 0> { + + let has_vdst = 0; +} + +class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr, rc:$data0, rc:$data1, + offset0:$offset0, offset1:$offset1, gds:$gds), + "$addr, $data0, $data1$offset0$offset1$gds"> { + + let has_vdst = 0; + let has_offset = 0; + let AsmMatchConverter = "cvtDSOffset01"; +} + +class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), + "$vdst, $addr, $data0$offset$gds"> { + + let hasPostISelHook = 1; + let has_data1 = 0; +} + +class DS_1A2D_RET<string opName, + RegisterClass rc = VGPR_32, + RegisterClass src = rc> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds), + "$vdst, $addr, $data0, $data1$offset$gds"> { + + let hasPostISelHook = 1; +} + +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, offset:$offset, gds:$gds), + "$vdst, $addr$offset$gds"> { + + let has_data0 = 0; + let has_data1 = 0; +} + +class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), + "$vdst, $addr$offset0$offset1$gds"> { + + let has_offset = 0; + let has_data0 = 0; + let has_data1 = 0; + let AsmMatchConverter = "cvtDSOffset01"; +} + +class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, + (outs VGPR_32:$vdst), + (ins VGPR_32:$addr, offset:$offset), + "$vdst, $addr$offset gds"> { + + let has_data0 = 0; + let has_data1 = 0; + let has_gds = 0; + let gdsValue = 1; +} + +class DS_0A_RET <string opName> : DS_Pseudo<opName, + (outs VGPR_32:$vdst), + (ins offset:$offset, gds:$gds), + "$vdst$offset$gds"> { + + let mayLoad = 1; + let mayStore = 1; + + let has_addr = 0; + let has_data0 = 0; + let has_data1 = 0; +} + +class DS_1A <string opName> : DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr, offset:$offset, gds:$gds), + "$addr$offset$gds"> { + + let mayLoad = 1; + let mayStore = 1; + + let has_vdst = 0; + let has_data0 = 0; + let has_data1 = 0; +} + +class DS_1A_GDS <string opName> : DS_Pseudo<opName, + (outs), + (ins VGPR_32:$addr), + "$addr gds"> { + + let has_vdst = 0; + let has_data0 = 0; + let has_data1 = 0; + let has_offset = 0; + let has_offset0 = 0; + let has_offset1 = 0; + + let has_gds = 0; + let gdsValue = 1; +} + +class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> +: DS_Pseudo<opName, + (outs VGPR_32:$vdst), + (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset), + "$vdst, $addr, $data0$offset", + [(set i32:$vdst, + (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > { + + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + + let has_data1 = 0; + let has_gds = 0; +} + +def DS_ADD_U32 : DS_1A1D_NORET<"ds_add_u32">; +def DS_SUB_U32 : DS_1A1D_NORET<"ds_sub_u32">; +def DS_RSUB_U32 : DS_1A1D_NORET<"ds_rsub_u32">; +def DS_INC_U32 : DS_1A1D_NORET<"ds_inc_u32">; +def DS_DEC_U32 : DS_1A1D_NORET<"ds_dec_u32">; +def DS_MIN_I32 : DS_1A1D_NORET<"ds_min_i32">; +def DS_MAX_I32 : DS_1A1D_NORET<"ds_max_i32">; +def DS_MIN_U32 : DS_1A1D_NORET<"ds_min_u32">; +def DS_MAX_U32 : DS_1A1D_NORET<"ds_max_u32">; +def DS_AND_B32 : DS_1A1D_NORET<"ds_and_b32">; +def DS_OR_B32 : 
DS_1A1D_NORET<"ds_or_b32">; +def DS_XOR_B32 : DS_1A1D_NORET<"ds_xor_b32">; +def DS_ADD_F32 : DS_1A1D_NORET<"ds_add_f32">; +def DS_MIN_F32 : DS_1A1D_NORET<"ds_min_f32">; +def DS_MAX_F32 : DS_1A1D_NORET<"ds_max_f32">; + +let mayLoad = 0 in { +def DS_WRITE_B8 : DS_1A1D_NORET<"ds_write_b8">; +def DS_WRITE_B16 : DS_1A1D_NORET<"ds_write_b16">; +def DS_WRITE_B32 : DS_1A1D_NORET<"ds_write_b32">; +def DS_WRITE2_B32 : DS_1A2D_Off8_NORET<"ds_write2_b32">; +def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">; +} + +def DS_MSKOR_B32 : DS_1A2D_NORET<"ds_mskor_b32">; +def DS_CMPST_B32 : DS_1A2D_NORET<"ds_cmpst_b32">; +def DS_CMPST_F32 : DS_1A2D_NORET<"ds_cmpst_f32">; + +def DS_ADD_U64 : DS_1A1D_NORET<"ds_add_u64", VReg_64>; +def DS_SUB_U64 : DS_1A1D_NORET<"ds_sub_u64", VReg_64>; +def DS_RSUB_U64 : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>; +def DS_INC_U64 : DS_1A1D_NORET<"ds_inc_u64", VReg_64>; +def DS_DEC_U64 : DS_1A1D_NORET<"ds_dec_u64", VReg_64>; +def DS_MIN_I64 : DS_1A1D_NORET<"ds_min_i64", VReg_64>; +def DS_MAX_I64 : DS_1A1D_NORET<"ds_max_i64", VReg_64>; +def DS_MIN_U64 : DS_1A1D_NORET<"ds_min_u64", VReg_64>; +def DS_MAX_U64 : DS_1A1D_NORET<"ds_max_u64", VReg_64>; +def DS_AND_B64 : DS_1A1D_NORET<"ds_and_b64", VReg_64>; +def DS_OR_B64 : DS_1A1D_NORET<"ds_or_b64", VReg_64>; +def DS_XOR_B64 : DS_1A1D_NORET<"ds_xor_b64", VReg_64>; +def DS_MSKOR_B64 : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +def DS_WRITE_B64 : DS_1A1D_NORET<"ds_write_b64", VReg_64>; +def DS_WRITE2_B64 : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>; +def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>; +} +def DS_CMPST_B64 : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>; +def DS_CMPST_F64 : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>; +def DS_MIN_F64 : DS_1A1D_NORET<"ds_min_f64", VReg_64>; +def DS_MAX_F64 : DS_1A1D_NORET<"ds_max_f64", VReg_64>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET<"ds_add_rtn_u32">, + AtomicNoRet<"ds_add_u32", 1>; +def DS_ADD_RTN_F32 : DS_1A1D_RET<"ds_add_rtn_f32">, + AtomicNoRet<"ds_add_f32", 1>; +def DS_SUB_RTN_U32 : DS_1A1D_RET<"ds_sub_rtn_u32">, + AtomicNoRet<"ds_sub_u32", 1>; +def DS_RSUB_RTN_U32 : DS_1A1D_RET<"ds_rsub_rtn_u32">, + AtomicNoRet<"ds_rsub_u32", 1>; +def DS_INC_RTN_U32 : DS_1A1D_RET<"ds_inc_rtn_u32">, + AtomicNoRet<"ds_inc_u32", 1>; +def DS_DEC_RTN_U32 : DS_1A1D_RET<"ds_dec_rtn_u32">, + AtomicNoRet<"ds_dec_u32", 1>; +def DS_MIN_RTN_I32 : DS_1A1D_RET<"ds_min_rtn_i32">, + AtomicNoRet<"ds_min_i32", 1>; +def DS_MAX_RTN_I32 : DS_1A1D_RET<"ds_max_rtn_i32">, + AtomicNoRet<"ds_max_i32", 1>; +def DS_MIN_RTN_U32 : DS_1A1D_RET<"ds_min_rtn_u32">, + AtomicNoRet<"ds_min_u32", 1>; +def DS_MAX_RTN_U32 : DS_1A1D_RET<"ds_max_rtn_u32">, + AtomicNoRet<"ds_max_u32", 1>; +def DS_AND_RTN_B32 : DS_1A1D_RET<"ds_and_rtn_b32">, + AtomicNoRet<"ds_and_b32", 1>; +def DS_OR_RTN_B32 : DS_1A1D_RET<"ds_or_rtn_b32">, + AtomicNoRet<"ds_or_b32", 1>; +def DS_XOR_RTN_B32 : DS_1A1D_RET<"ds_xor_rtn_b32">, + AtomicNoRet<"ds_xor_b32", 1>; +def DS_MSKOR_RTN_B32 : DS_1A2D_RET<"ds_mskor_rtn_b32">, + AtomicNoRet<"ds_mskor_b32", 1>; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <"ds_cmpst_rtn_b32">, + AtomicNoRet<"ds_cmpst_b32", 1>; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <"ds_cmpst_rtn_f32">, + AtomicNoRet<"ds_cmpst_f32", 1>; +def DS_MIN_RTN_F32 : DS_1A1D_RET <"ds_min_rtn_f32">, + AtomicNoRet<"ds_min_f32", 1>; +def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">, + AtomicNoRet<"ds_max_f32", 1>; + +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">, + AtomicNoRet<"", 1>; +def DS_WRXCHG2_RTN_B32 : 
DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, + AtomicNoRet<"", 1>; +def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, + AtomicNoRet<"", 1>; + +def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>, + AtomicNoRet<"ds_add_u64", 1>; +def DS_SUB_RTN_U64 : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>, + AtomicNoRet<"ds_sub_u64", 1>; +def DS_RSUB_RTN_U64 : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>, + AtomicNoRet<"ds_rsub_u64", 1>; +def DS_INC_RTN_U64 : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>, + AtomicNoRet<"ds_inc_u64", 1>; +def DS_DEC_RTN_U64 : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>, + AtomicNoRet<"ds_dec_u64", 1>; +def DS_MIN_RTN_I64 : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>, + AtomicNoRet<"ds_min_i64", 1>; +def DS_MAX_RTN_I64 : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>, + AtomicNoRet<"ds_max_i64", 1>; +def DS_MIN_RTN_U64 : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>, + AtomicNoRet<"ds_min_u64", 1>; +def DS_MAX_RTN_U64 : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>, + AtomicNoRet<"ds_max_u64", 1>; +def DS_AND_RTN_B64 : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>, + AtomicNoRet<"ds_and_b64", 1>; +def DS_OR_RTN_B64 : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>, + AtomicNoRet<"ds_or_b64", 1>; +def DS_XOR_RTN_B64 : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>, + AtomicNoRet<"ds_xor_b64", 1>; +def DS_MSKOR_RTN_B64 : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>, + AtomicNoRet<"ds_mskor_b64", 1>; +def DS_CMPST_RTN_B64 : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>, + AtomicNoRet<"ds_cmpst_b64", 1>; +def DS_CMPST_RTN_F64 : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>, + AtomicNoRet<"ds_cmpst_f64", 1>; +def DS_MIN_RTN_F64 : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>, + AtomicNoRet<"ds_min_f64", 1>; +def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>, + AtomicNoRet<"ds_max_f64", 1>; + +def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>, + AtomicNoRet<"ds_wrxchg_b64", 1>; +def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"ds_wrxchg2_b64", 1>; +def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"ds_wrxchg2st64_b64", 1>; + +def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">; +def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">; +def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">; +def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">; +def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">; + +def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; +def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; +def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">; +def DS_INC_SRC2_U32 : DS_1A<"ds_inc_src2_u32">; +def DS_DEC_SRC2_U32 : DS_1A<"ds_dec_src2_u32">; +def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">; +def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">; +def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">; +def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">; +def DS_AND_SRC2_B32 : DS_1A<"ds_and_src_b32">; +def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">; +def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">; +def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">; +def DS_MAX_SRC2_F32 : DS_1A<"ds_max_src2_f32">; + +def DS_ADD_SRC2_U64 : DS_1A<"ds_add_src2_u64">; +def DS_SUB_SRC2_U64 : DS_1A<"ds_sub_src2_u64">; +def DS_RSUB_SRC2_U64 : DS_1A<"ds_rsub_src2_u64">; +def DS_INC_SRC2_U64 : DS_1A<"ds_inc_src2_u64">; +def DS_DEC_SRC2_U64 : DS_1A<"ds_dec_src2_u64">; +def DS_MIN_SRC2_I64 : DS_1A<"ds_min_src2_i64">; +def DS_MAX_SRC2_I64 : DS_1A<"ds_max_src2_i64">; +def DS_MIN_SRC2_U64 : DS_1A<"ds_min_src2_u64">; +def DS_MAX_SRC2_U64 : DS_1A<"ds_max_src2_u64">; +def DS_AND_SRC2_B64 : 
DS_1A<"ds_and_src2_b64">; +def DS_OR_SRC2_B64 : DS_1A<"ds_or_src2_b64">; +def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">; +def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">; +def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">; + +def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">; +def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">; + +let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; +} + +let mayStore = 0 in { +def DS_READ_I8 : DS_1A_RET<"ds_read_i8">; +def DS_READ_U8 : DS_1A_RET<"ds_read_u8">; +def DS_READ_I16 : DS_1A_RET<"ds_read_i16">; +def DS_READ_U16 : DS_1A_RET<"ds_read_u16">; +def DS_READ_B32 : DS_1A_RET<"ds_read_b32">; +def DS_READ_B64 : DS_1A_RET<"ds_read_b64", VReg_64>; + +def DS_READ2_B32 : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>; +def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>; + +def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>; +def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>; +} + +let SubtargetPredicate = isSICI in { +def DS_CONSUME : DS_0A_RET<"ds_consume">; +def DS_APPEND : DS_0A_RET<"ds_append">; +def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; +} + +//===----------------------------------------------------------------------===// +// Instruction definitions for CI and newer. +//===----------------------------------------------------------------------===// +// Remaining instructions: +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 + +let SubtargetPredicate = isCIVI in { + +def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">, + AtomicNoRet<"ds_wrap_f32", 1>; + +} // let SubtargetPredicate = isCIVI + +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. 
+//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isVI in { + +let Uses = [EXEC] in { +def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", + int_amdgcn_ds_permute>; +def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", + int_amdgcn_ds_bpermute>; +} + +} // let SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// DS Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isGCN] in { + +def : Pat < + (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), + (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) +>; + +class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, (as_i16imm $offset), (i1 0)) +>; + +def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; +def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; +def : DSReadPat <DS_READ_U16, i16, si_load_local>; +def : DSReadPat <DS_READ_B32, i32, si_load_local>; + +let AddedComplexity = 100 in { + +def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; + +} // End AddedComplexity = 100 + +def : Pat < + (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) +>; + +class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; +def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i16, si_store_local>; +def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; + +let AddedComplexity = 100 in { + +def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; +} // End AddedComplexity = 100 + +def : Pat < + (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), + (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, + (i1 0)) +>; + +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) +>; + + +// 32-bit atomics. 
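As a standalone illustration (not part of the patch itself), the sketch below restates how the ds_read2_b32/ds_write2_b32 patterns above address memory: the two 8-bit offset operands count dwords rather than bytes, so a 64-bit access only folds into a single read2/write2 when both element offsets fit in 8 bits. The helper name and the exact legality checks are illustrative assumptions, not the backend's real selection code; the 32-bit atomic patterns continue right after the sketch.

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

// Illustrative only: split a byte offset from a 4-byte-aligned LDS access
// into the two 8-bit element offsets used by ds_read2_b32/ds_write2_b32.
// The real selection logic lives in the AMDGPU DAG ISel, not here.
static std::optional<std::pair<uint8_t, uint8_t>>
splitRead2Offsets(uint64_t ByteOffset) {
  if (ByteOffset % 4 != 0)          // elements must be dword aligned
    return std::nullopt;
  uint64_t Elt0 = ByteOffset / 4;   // offsets are counted in dwords
  uint64_t Elt1 = Elt0 + 1;         // second dword of the 64-bit value
  if (Elt1 > 0xff)                  // each offset field is only 8 bits wide
    return std::nullopt;
  return std::make_pair(uint8_t(Elt0), uint8_t(Elt1));
}

int main() {
  // A v2i32 load at byte offset 8 can be selected as ds_read2_b32 with
  // offsets 2 and 3 (the 8-byte-aligned case prefers ds_read_b64 via the
  // AddedComplexity = 100 patterns above).
  auto Offs = splitRead2Offsets(8);
  assert(Offs && Offs->first == 2 && Offs->second == 3);
  // Byte offset 1024 is dword 256, which no longer fits in 8 bits.
  assert(!splitRead2Offsets(1024));
  return 0;
}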
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>; +def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; + +// 64-bit atomics. +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>; +def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; + +} // let Predicates = [isGCN] + +//===----------------------------------------------------------------------===// +// Real instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SIInstructions.td +//===----------------------------------------------------------------------===// + +class DS_Real_si <bits<8> op, DS_Pseudo ds> : + DS_Real <ds>, + SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> { + let AssemblerPredicates=[isSICI]; + let DecoderNamespace="SICI"; + + // encoding + let Inst{7-0} = !if(ds.has_offset0, offset0, 0); + let Inst{15-8} = !if(ds.has_offset1, offset1, 0); + let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue); + let Inst{25-18} = op; + let Inst{31-26} = 0x36; // ds prefix + let Inst{39-32} = !if(ds.has_addr, addr, 0); + let Inst{47-40} = !if(ds.has_data0, data0, 0); + let Inst{55-48} = !if(ds.has_data1, data1, 0); + let Inst{63-56} = !if(ds.has_vdst, vdst, 0); +} + +def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>; +def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>; +def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>; +def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>; +def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>; +def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>; +def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>; +def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>; +def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>; +def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>; +def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>; +def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>; +def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>; +def DS_WRITE_B32_si : DS_Real_si<0xd, 
DS_WRITE_B32>; +def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>; +def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>; +def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>; +def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>; +def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>; +def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>; +def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>; +def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>; +def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>; +def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>; +def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>; +def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>; +def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>; +def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>; +def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>; +def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>; +def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>; +def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>; +def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>; +def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>; +def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>; +def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>; +def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>; +def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>; +def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>; +def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>; +def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>; +def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>; +def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>; +def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>; +def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>; +def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>; +def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>; + +// FIXME: this instruction is actually CI/VI +def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>; + +def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>; +def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>; +def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>; +def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>; +def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>; +def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>; +def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>; +def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>; +def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>; +def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>; +def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>; +def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>; +def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>; +def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>; +def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>; +def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>; +def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>; +def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>; +def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>; +def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>; +def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>; +def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>; +def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>; +def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>; +def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>; +def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>; +def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>; +def DS_CMPST_B64_si : 
DS_Real_si<0x50, DS_CMPST_B64>; +def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>; +def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>; +def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>; + +def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>; +def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>; +def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>; +def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>; +def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>; +def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>; +def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>; +def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>; +def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>; +def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>; +def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>; +def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>; +def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>; +def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>; +def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>; +def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>; +def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>; +def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>; +def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>; +def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>; + +def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>; +def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>; +def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>; + +def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>; +def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>; +def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>; +def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>; +def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>; +def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>; +def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>; +def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>; +def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>; +def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>; +def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>; +def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>; +def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>; + +def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>; +def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>; + +def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>; +def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>; +def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>; +def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>; +def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>; +def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>; +def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>; +def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>; +def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>; +def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>; +def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>; +def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>; +def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>; + +def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; +def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; + +//===----------------------------------------------------------------------===// +// VIInstructions.td 
+//===----------------------------------------------------------------------===// + +class DS_Real_vi <bits<8> op, DS_Pseudo ds> : + DS_Real <ds>, + SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> { + let AssemblerPredicates = [isVI]; + let DecoderNamespace="VI"; + + // encoding + let Inst{7-0} = !if(ds.has_offset0, offset0, 0); + let Inst{15-8} = !if(ds.has_offset1, offset1, 0); + let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue); + let Inst{24-17} = op; + let Inst{31-26} = 0x36; // ds prefix + let Inst{39-32} = !if(ds.has_addr, addr, 0); + let Inst{47-40} = !if(ds.has_data0, data0, 0); + let Inst{55-48} = !if(ds.has_data1, data1, 0); + let Inst{63-56} = !if(ds.has_vdst, vdst, 0); +} + +def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>; +def DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>; +def DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>; +def DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>; +def DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>; +def DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>; +def DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>; +def DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>; +def DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>; +def DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>; +def DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>; +def DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>; +def DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>; +def DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>; +def DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>; +def DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>; +def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>; +def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>; +def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>; +def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>; +def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>; +def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>; +def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>; +def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>; +def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>; +def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>; +def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; +def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>; +def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; +def DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>; +def DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>; +def DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>; +def DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>; +def DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>; +def DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>; +def DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>; +def DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>; +def DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>; +def DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>; +def DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>; +def DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>; +def DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>; +def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>; +def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>; +def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>; +def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>; +def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>; +def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>; +def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>; +def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>; +def DS_READ_B32_vi : 
DS_Real_vi<0x36, DS_READ_B32>; +def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>; +def DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>; +def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; +def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; +def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; +def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; +def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>; +def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>; +def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>; + +def DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>; +def DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>; +def DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>; +def DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>; +def DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>; +def DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>; +def DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>; +def DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>; +def DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>; +def DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>; +def DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>; +def DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>; +def DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>; +def DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>; +def DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>; +def DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>; +def DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>; +def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>; +def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>; +def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>; + +def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>; +def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>; +def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>; +def DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>; +def DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>; +def DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>; +def DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>; +def DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>; +def DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>; +def DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>; +def DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>; +def DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>; +def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>; +def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>; +def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>; +def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>; +def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>; +def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>; +def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>; +def DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>; + +def DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>; +def DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>; +def DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>; + +def DS_ADD_SRC2_U32_vi : DS_Real_vi<0x80, DS_ADD_SRC2_U32>; +def DS_SUB_SRC2_U32_vi : DS_Real_vi<0x81, DS_SUB_SRC2_U32>; +def DS_RSUB_SRC2_U32_vi : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>; +def DS_INC_SRC2_U32_vi : DS_Real_vi<0x83, DS_INC_SRC2_U32>; +def DS_DEC_SRC2_U32_vi : DS_Real_vi<0x84, DS_DEC_SRC2_U32>; +def DS_MIN_SRC2_I32_vi : DS_Real_vi<0x85, DS_MIN_SRC2_I32>; +def DS_MAX_SRC2_I32_vi : DS_Real_vi<0x86, DS_MAX_SRC2_I32>; +def DS_MIN_SRC2_U32_vi : DS_Real_vi<0x87, DS_MIN_SRC2_U32>; +def DS_MAX_SRC2_U32_vi : DS_Real_vi<0x88, DS_MAX_SRC2_U32>; +def 
DS_AND_SRC2_B32_vi : DS_Real_vi<0x89, DS_AND_SRC2_B32>; +def DS_OR_SRC2_B32_vi : DS_Real_vi<0x8a, DS_OR_SRC2_B32>; +def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>; +def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>; +def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>; +def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>; +def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>; +def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>; +def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>; +def DS_INC_SRC2_U64_vi : DS_Real_vi<0xc3, DS_INC_SRC2_U64>; +def DS_DEC_SRC2_U64_vi : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>; +def DS_MIN_SRC2_I64_vi : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>; +def DS_MAX_SRC2_I64_vi : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>; +def DS_MIN_SRC2_U64_vi : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>; +def DS_MAX_SRC2_U64_vi : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>; +def DS_AND_SRC2_B64_vi : DS_Real_vi<0xc9, DS_AND_SRC2_B64>; +def DS_OR_SRC2_B64_vi : DS_Real_vi<0xca, DS_OR_SRC2_B64>; +def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>; +def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>; +def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>; +def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>; diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e11de85..2247cad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" @@ -48,6 +49,18 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) { MCDisassembler::SoftFail; } +static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + + APInt SignedOffset(18, Imm * 4, true); + int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue(); + + if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2)) + return MCDisassembler::Success; + return addOperand(Inst, MCOperand::createImm(Imm)); +} + #define DECODE_OPERAND2(RegClass, DecName) \ static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ unsigned Imm, \ @@ -68,12 +81,22 @@ DECODE_OPERAND(VReg_96) DECODE_OPERAND(VReg_128) DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0) +DECODE_OPERAND(SReg_32_XM0_XEXEC) DECODE_OPERAND(SReg_64) +DECODE_OPERAND(SReg_64_XEXEC) DECODE_OPERAND(SReg_128) DECODE_OPERAND(SReg_256) DECODE_OPERAND(SReg_512) + +static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); +} + #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM @@ -217,12 +240,14 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in // this bundle? 
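The decodeSoppBrTarget helper added earlier in this file's diff treats the 16-bit SOPP immediate as a signed dword offset from the instruction that follows the 4-byte branch, which is what its APInt arithmetic computes. The sketch below restates the same calculation with plain integer math; it is an illustration only, and the function name is an assumption rather than code from the patch.

#include <cassert>
#include <cstdint>

// Plain-arithmetic restatement of decodeSoppBrTarget: sign-extend the
// 16-bit immediate, scale dwords to bytes, and add it to the address of
// the instruction after the branch (Addr + 4).
static uint64_t soppBranchTarget(uint16_t Imm, uint64_t Addr) {
  int64_t WordOffset = static_cast<int16_t>(Imm); // sign-extend 16 -> 64
  return Addr + 4 + WordOffset * 4;               // dwords to bytes
}

int main() {
  // A branch at address 0x100 with an immediate of 3 lands at 0x110.
  assert(soppBranchTarget(3, 0x100) == 0x110);
  // 0xffff encodes -1: the target is the branch's own address.
  assert(soppBranchTarget(0xffff, 0x100) == 0x100);
  return 0;
}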
default: - assert(false); - break; + llvm_unreachable("unhandled register class"); } - if (Val % (1 << shift)) + + if (Val % (1 << shift)) { *CommentStream << "Warning: " << getRegClassName(SRegClassID) << ": scalar reg isn't aligned " << Val; + } + return createRegOperand(SRegClassID, Val >> shift); } @@ -234,7 +259,16 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { + return decodeSrcOp(OPW16, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { + // Some instructions have operand restrictions beyond what the encoding + // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra + // high bit. + Val &= 255; + return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); } @@ -257,13 +291,17 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { return decodeSrcOp(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const { - // SReg_32_XM0 is SReg_32 without M0 +MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC( + unsigned Val) const { + // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI return decodeOperand_SReg_32(Val); } MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { - // see decodeOperand_SReg_32 comment + return decodeSrcOp(OPW64, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const { return decodeSrcOp(OPW64, Val); } @@ -299,28 +337,96 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { // Cast prevents negative overflow. } -MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) { +static int64_t getInlineImmVal32(unsigned Imm) { + switch (Imm) { + case 240: + return FloatToBits(0.5f); + case 241: + return FloatToBits(-0.5f); + case 242: + return FloatToBits(1.0f); + case 243: + return FloatToBits(-1.0f); + case 244: + return FloatToBits(2.0f); + case 245: + return FloatToBits(-2.0f); + case 246: + return FloatToBits(4.0f); + case 247: + return FloatToBits(-4.0f); + case 248: // 1 / (2 * PI) + return 0x3e22f983; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +static int64_t getInlineImmVal64(unsigned Imm) { + switch (Imm) { + case 240: + return DoubleToBits(0.5); + case 241: + return DoubleToBits(-0.5); + case 242: + return DoubleToBits(1.0); + case 243: + return DoubleToBits(-1.0); + case 244: + return DoubleToBits(2.0); + case 245: + return DoubleToBits(-2.0); + case 246: + return DoubleToBits(4.0); + case 247: + return DoubleToBits(-4.0); + case 248: // 1 / (2 * PI) + return 0x3fc45f306dc9c882; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +static int64_t getInlineImmVal16(unsigned Imm) { + switch (Imm) { + case 240: + return 0x3800; + case 241: + return 0xB800; + case 242: + return 0x3C00; + case 243: + return 0xBC00; + case 244: + return 0x4000; + case 245: + return 0xC000; + case 246: + return 0x4400; + case 247: + return 0xC400; + case 248: // 1 / (2 * PI) + return 0x3118; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX); + // ToDo: case 248: 1/(2*PI) - is allowed only on VI - // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as - // literal constant. 
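The getInlineImmVal32/64/16 tables above hard-code the IEEE-754 bit patterns of the eight signed inline floating-point constants and of 1/(2*pi), which has no short literal. The standalone check below is an illustration only, not part of the patch; floatBits is a stand-in for llvm::FloatToBits, and the 16-bit table holds the same constants in half-precision form (for example 1.0 as 0x3C00).

#include <cassert>
#include <cstdint>
#include <cstring>

// Verify a few entries of the 32-bit inline-immediate table against the
// IEEE-754 encodings of the constants they name.
static uint32_t floatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // bit-preserving float -> uint32_t
  return Bits;
}

int main() {
  assert(floatBits(0.5f)  == 0x3f000000); // encoding 240
  assert(floatBits(-0.5f) == 0xbf000000); // encoding 241
  assert(floatBits(1.0f)  == 0x3f800000); // encoding 242
  assert(floatBits(4.0f)  == 0x40800000); // encoding 246
  // Encoding 248 is 1/(2*pi) and is stored directly as 0x3e22f983 above.
  return 0;
}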
- float V = 0.0f; - switch (Imm) { - case 240: V = 0.5f; break; - case 241: V = -0.5f; break; - case 242: V = 1.0f; break; - case 243: V = -1.0f; break; - case 244: V = 2.0f; break; - case 245: V = -2.0f; break; - case 246: V = 4.0f; break; - case 247: V = -4.0f; break; - case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI) - 0x3e22f983 : - 0x3fc45f306dc9c882); - default: break; + switch (Width) { + case OPW32: + return MCOperand::createImm(getInlineImmVal32(Imm)); + case OPW64: + return MCOperand::createImm(getInlineImmVal64(Imm)); + case OPW16: + return MCOperand::createImm(getInlineImmVal16(Imm)); + default: + llvm_unreachable("implement me"); } - return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V)); } unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { @@ -328,7 +434,9 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return VGPR_32RegClassID; + case OPW32: + case OPW16: + return VGPR_32RegClassID; case OPW64: return VReg_64RegClassID; case OPW128: return VReg_128RegClassID; } @@ -339,7 +447,9 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return SGPR_32RegClassID; + case OPW32: + case OPW16: + return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; } @@ -350,7 +460,9 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return TTMP_32RegClassID; + case OPW32: + case OPW16: + return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; } @@ -371,19 +483,26 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW32 || Width == OPW64); - const bool Is32 = (Width == OPW32); + assert(Width == OPW16 || Width == OPW32 || Width == OPW64); if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX) - return decodeFPImmed(Is32, Val); + return decodeFPImmed(Width, Val); if (Val == LITERAL_CONST) return decodeLiteralConstant(); - return Is32 ? 
decodeSpecialReg32(Val) : decodeSpecialReg64(Val); + switch (Width) { + case OPW32: + case OPW16: + return decodeSpecialReg32(Val); + case OPW64: + return decodeSpecialReg64(Val); + default: + llvm_unreachable("unexpected immediate type"); + } } MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { @@ -426,6 +545,56 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +//===----------------------------------------------------------------------===// +// AMDGPUSymbolizer +//===----------------------------------------------------------------------===// + +// Try to find symbol name for specified label +bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, + raw_ostream &/*cStream*/, int64_t Value, + uint64_t /*Address*/, bool IsBranch, + uint64_t /*Offset*/, uint64_t /*InstSize*/) { + typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy; + typedef std::vector<SymbolInfoTy> SectionSymbolsTy; + + if (!IsBranch) { + return false; + } + + auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo); + auto Result = std::find_if(Symbols->begin(), Symbols->end(), + [Value](const SymbolInfoTy& Val) { + return std::get<0>(Val) == static_cast<uint64_t>(Value) + && std::get<2>(Val) == ELF::STT_NOTYPE; + }); + if (Result != Symbols->end()) { + auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result)); + const auto *Add = MCSymbolRefExpr::create(Sym, Ctx); + Inst.addOperand(MCOperand::createExpr(Add)); + return true; + } + return false; +} + +void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream, + int64_t Value, + uint64_t Address) { + llvm_unreachable("unimplemented"); +} + +//===----------------------------------------------------------------------===// +// Initialization +//===----------------------------------------------------------------------===// + +static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/, + LLVMOpInfoCallback /*GetOpInfo*/, + LLVMSymbolLookupCallback /*SymbolLookUp*/, + void *DisInfo, + MCContext *Ctx, + std::unique_ptr<MCRelocationInfo> &&RelInfo) { + return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo); +} + static MCDisassembler *createAMDGPUDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { @@ -433,5 +602,8 @@ static MCDisassembler *createAMDGPUDisassembler(const Target &T, } extern "C" void LLVMInitializeAMDGPUDisassembler() { - TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(), + createAMDGPUDisassembler); + TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(), + createAMDGPUSymbolizer); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index dff26a0..ee5883a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -18,76 +18,113 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCRelocationInfo.h" +#include "llvm/MC/MCDisassembler/MCSymbolizer.h" +#include <cstdint> +#include <algorithm> +#include <memory> namespace llvm { - class MCContext; - class MCInst; - class MCOperand; - class MCSubtargetInfo; - class Twine; - - class AMDGPUDisassembler : public MCDisassembler { - private: - mutable ArrayRef<uint8_t> Bytes; - - public: - 
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : - MCDisassembler(STI, Ctx) {} - - ~AMDGPUDisassembler() {} - - DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &WS, raw_ostream &CS) const override; - - const char* getRegClassName(unsigned RegClassID) const; - - MCOperand createRegOperand(unsigned int RegId) const; - MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; - MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; - - MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const; - - DecodeStatus tryDecodeInst(const uint8_t* Table, - MCInst &MI, - uint64_t Inst, - uint64_t Address) const; - - MCOperand decodeOperand_VGPR_32(unsigned Val) const; - MCOperand decodeOperand_VS_32(unsigned Val) const; - MCOperand decodeOperand_VS_64(unsigned Val) const; - - MCOperand decodeOperand_VReg_64(unsigned Val) const; - MCOperand decodeOperand_VReg_96(unsigned Val) const; - MCOperand decodeOperand_VReg_128(unsigned Val) const; - - MCOperand decodeOperand_SReg_32(unsigned Val) const; - MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const; - MCOperand decodeOperand_SReg_64(unsigned Val) const; - MCOperand decodeOperand_SReg_128(unsigned Val) const; - MCOperand decodeOperand_SReg_256(unsigned Val) const; - MCOperand decodeOperand_SReg_512(unsigned Val) const; - - enum OpWidthTy { - OPW32, - OPW64, - OPW128, - OPW_LAST_, - OPW_FIRST_ = OPW32 - }; - unsigned getVgprClassId(const OpWidthTy Width) const; - unsigned getSgprClassId(const OpWidthTy Width) const; - unsigned getTtmpClassId(const OpWidthTy Width) const; - - static MCOperand decodeIntImmed(unsigned Imm); - static MCOperand decodeFPImmed(bool Is32, unsigned Imm); - MCOperand decodeLiteralConstant() const; - - MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; - MCOperand decodeSpecialReg32(unsigned Val) const; - MCOperand decodeSpecialReg64(unsigned Val) const; +class MCContext; +class MCInst; +class MCOperand; +class MCSubtargetInfo; +class Twine; + +//===----------------------------------------------------------------------===// +// AMDGPUDisassembler +//===----------------------------------------------------------------------===// + +class AMDGPUDisassembler : public MCDisassembler { +private: + mutable ArrayRef<uint8_t> Bytes; + +public: + AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) {} + + ~AMDGPUDisassembler() override = default; + + DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &WS, raw_ostream &CS) const override; + + const char* getRegClassName(unsigned RegClassID) const; + + MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; + MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; + + MCOperand errOperand(unsigned V, const Twine& ErrMsg) const; + + DecodeStatus tryDecodeInst(const uint8_t* Table, + MCInst &MI, + uint64_t Inst, + uint64_t Address) const; + + MCOperand decodeOperand_VGPR_32(unsigned Val) const; + MCOperand decodeOperand_VS_32(unsigned Val) const; + MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VSrc16(unsigned Val) const; + + MCOperand decodeOperand_VReg_64(unsigned Val) const; + MCOperand decodeOperand_VReg_96(unsigned Val) const; + MCOperand decodeOperand_VReg_128(unsigned Val) const; + + MCOperand decodeOperand_SReg_32(unsigned Val) const; + 
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; + MCOperand decodeOperand_SReg_64(unsigned Val) const; + MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; + MCOperand decodeOperand_SReg_128(unsigned Val) const; + MCOperand decodeOperand_SReg_256(unsigned Val) const; + MCOperand decodeOperand_SReg_512(unsigned Val) const; + + enum OpWidthTy { + OPW32, + OPW64, + OPW128, + OPW16, + OPW_LAST_, + OPW_FIRST_ = OPW32 }; -} // namespace llvm -#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H + unsigned getVgprClassId(const OpWidthTy Width) const; + unsigned getSgprClassId(const OpWidthTy Width) const; + unsigned getTtmpClassId(const OpWidthTy Width) const; + + static MCOperand decodeIntImmed(unsigned Imm); + static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm); + MCOperand decodeLiteralConstant() const; + + MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSpecialReg32(unsigned Val) const; + MCOperand decodeSpecialReg64(unsigned Val) const; +}; + +//===----------------------------------------------------------------------===// +// AMDGPUSymbolizer +//===----------------------------------------------------------------------===// + +class AMDGPUSymbolizer : public MCSymbolizer { +private: + void *DisInfo; + +public: + AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo, + void *disInfo) + : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {} + + bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream, + int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, + uint64_t InstSize) override; + + void tryAddingPcLoadReferenceComment(raw_ostream &cStream, + int64_t Value, + uint64_t Address) override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 94f05cc..48c6592 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -72,6 +72,8 @@ def MULLO_INT_eg : MULLO_INT_Common<0x8F>; def MULHI_INT_eg : MULHI_INT_Common<0x90>; def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>; + def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; @@ -116,14 +118,13 @@ def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; } // End usesCustomInserter = 1 -class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> - : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> { +class VTX_READ_eg <string name, dag outs> + : VTX_WORD0_eg, VTX_READ<name, outs, []> { // Static fields let VC_INST = 0; let FETCH_TYPE = 2; let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; let SRC_REL = 0; // XXX: We can infer this field based on the SRC_GPR. This would allow us // to store vertex addresses in any channel, not just X. 
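The rewritten VTX_READ handling below drops the per-buffer instruction classes and instead passes the buffer index as an operand: the selection patterns that follow use 3 for kernel parameters, 2 for constant memory and 1 for global memory. The sketch here only restates that convention; the enum and function names are illustrative assumptions, not code from the patch.

#include <cassert>
#include <cstdint>

// Stand-in for the buffer-index convention used by the VTX_READ patterns:
// the same VTX_READ_*_eg instruction serves all three sources, with the
// buffer id supplied as its second operand.
enum R600BufferId : uint8_t {
  GlobalBuffer   = 1, // ordinary global-memory loads
  ConstantBuffer = 2, // constant address space
  ParamBuffer    = 3, // kernel parameter block
};

static R600BufferId bufferIdForLoad(bool IsParam, bool IsConstant) {
  if (IsParam)
    return ParamBuffer;
  return IsConstant ? ConstantBuffer : GlobalBuffer;
}

int main() {
  assert(bufferIdForLoad(/*IsParam=*/true,  /*IsConstant=*/false) == 3);
  assert(bufferIdForLoad(/*IsParam=*/false, /*IsConstant=*/true)  == 2);
  assert(bufferIdForLoad(/*IsParam=*/false, /*IsConstant=*/false) == 1);
  return 0;
}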
@@ -132,9 +133,9 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> let Inst{31-0} = Word0; } -class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_8_eg + : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let MEGA_FETCH_COUNT = 1; let DST_SEL_X = 0; @@ -144,9 +145,9 @@ class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> let DATA_FORMAT = 1; // FMT_8 } -class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_16_eg + : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let MEGA_FETCH_COUNT = 2; let DST_SEL_X = 0; let DST_SEL_Y = 7; // Masked @@ -156,9 +157,9 @@ class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> } -class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { +def VTX_READ_32_eg + : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", + (outs R600_TReg32_X:$dst_gpr)> { let MEGA_FETCH_COUNT = 4; let DST_SEL_X = 0; @@ -177,9 +178,9 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> let Constraints = "$src_gpr.ptr = $dst_gpr"; } -class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { +def VTX_READ_64_eg + : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", + (outs R600_Reg64:$dst_gpr)> { let MEGA_FETCH_COUNT = 8; let DST_SEL_X = 0; @@ -189,9 +190,9 @@ class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern> let DATA_FORMAT = 0x1D; // COLOR_32_32 } -class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> - : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { +def VTX_READ_128_eg + : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", + (outs R600_Reg128:$dst_gpr)> { let MEGA_FETCH_COUNT = 16; let DST_SEL_X = 0; @@ -209,80 +210,44 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> //===----------------------------------------------------------------------===// // VTX Read from parameter memory space //===----------------------------------------------------------------------===// +def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_eg MEMxi:$src_gpr, 3)>; +def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_eg MEMxi:$src_gpr, 3)>; +def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_eg MEMxi:$src_gpr, 3)>; +def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_eg MEMxi:$src_gpr, 3)>; +def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_eg MEMxi:$src_gpr, 3)>; -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3, - [(set v4i32:$dst_gpr, 
(load_param ADDRVTX_READ:$src_gpr))] ->; +//===----------------------------------------------------------------------===// +// VTX Read from constant memory space +//===----------------------------------------------------------------------===// +def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_eg MEMxi:$src_gpr, 2)>; +def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_eg MEMxi:$src_gpr, 2)>; +def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_eg MEMxi:$src_gpr, 2)>; +def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_eg MEMxi:$src_gpr, 2)>; +def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_eg MEMxi:$src_gpr, 2)>; //===----------------------------------------------------------------------===// // VTX Read from global memory space //===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] ->; - -// 16-bit reads -def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_ID1_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] ->; - -// 8-bit reads -def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2, - [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] ->; - -// 16-bit reads -def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2, - [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2, - [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2, - [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2, - [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] ->; +def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), + (VTX_READ_8_eg MEMxi:$src_gpr, 1)>; +def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), + (VTX_READ_16_eg MEMxi:$src_gpr, 1)>; +def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_32_eg MEMxi:$src_gpr, 1)>; +def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_64_eg MEMxi:$src_gpr, 1)>; +def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), + (VTX_READ_128_eg MEMxi:$src_gpr, 1)>; } // End Predicates = [isEG] @@ -368,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : 
R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td new file mode 100644 index 0000000..849fb8a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -0,0 +1,530 @@ +//===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">; + +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// + +class FLAT_Pseudo<string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + + let SubtargetPredicate = isCIVI; + + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; + let SchedRW = [WriteVMEM]; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_vdst = 1; + bits<1> has_data = 1; + bits<1> has_glc = 1; + bits<1> glcValue = 0; +} + +class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + Enc64 { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + + // encoding fields + bits<8> vaddr; + bits<8> vdata; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = vaddr; + let Inst{47-40} = !if(ps.has_data, vdata, ?); + // 54-48 is reserved. 
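For orientation, the sketch below packs the FLAT_Real encoding fields listed above (glc at bit 16, slc at 17, the 7-bit opcode at 24-18, the 0x37 marker at 31-26, then vaddr and vdata in the high dword) into a 64-bit word; the tfe and vdst assignments continue in the class body right after this aside. It is an illustration of the bit layout only, with a hypothetical helper name, since the real encoder is generated from these TableGen definitions.

#include <cassert>
#include <cstdint>

// Pack the FLAT encoding fields into the 64-bit instruction word, mirroring
// the Inst{...} assignments of FLAT_Real.
static uint64_t encodeFlat(uint8_t Op, uint8_t VAddr, uint8_t VData,
                           uint8_t VDst, bool Glc, bool Slc, bool Tfe) {
  uint64_t Inst = 0;
  Inst |= uint64_t(Glc)       << 16;
  Inst |= uint64_t(Slc)       << 17;
  Inst |= uint64_t(Op & 0x7f) << 18; // 7-bit opcode field
  Inst |= uint64_t(0x37)      << 26; // FLAT encoding marker
  Inst |= uint64_t(VAddr)     << 32;
  Inst |= uint64_t(VData)     << 40;
  Inst |= uint64_t(Tfe)       << 55;
  Inst |= uint64_t(VDst)      << 56;
  return Inst;
}

int main() {
  // Even with all-zero operands the fixed bits 31-26 still read 0x37.
  uint64_t Inst = encodeFlat(0, 0, 0, 0, false, false, false);
  assert(((Inst >> 26) & 0x3f) == 0x37);
  return 0;
}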
+ let Inst{55} = tfe; + let Inst{63-56} = !if(ps.has_vdst, vdst, ?); +} + +class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo< + opName, + (outs regClass:$vdst), + (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe), + " $vdst, $vaddr$glc$slc$tfe"> { + let has_data = 0; + let mayLoad = 1; +} + +class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo< + opName, + (outs), + (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe), + " $vaddr, $vdata$glc$slc$tfe"> { + let mayLoad = 0; + let mayStore = 1; + let has_vdst = 0; +} + +multiclass FLAT_Atomic_Pseudo< + string opName, + RegisterClass vdst_rc, + ValueType vt, + SDPatternOperator atomic = null_frag, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc> { + + def "" : FLAT_Pseudo <opName, + (outs), + (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), + " $vaddr, $vdata$slc$tfe", + []>, + AtomicNoRet <NAME, 0> { + let mayLoad = 1; + let mayStore = 1; + let has_glc = 0; + let glcValue = 0; + let has_vdst = 0; + let PseudoInstr = NAME; + } + + def _RTN : FLAT_Pseudo <opName, + (outs vdst_rc:$vdst), + (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), + " $vdst, $vaddr, $vdata glc$slc$tfe", + [(set vt:$vdst, + (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>, + AtomicNoRet <NAME, 1> { + let mayLoad = 1; + let mayStore = 1; + let hasPostISelHook = 1; + let has_glc = 0; + let glcValue = 1; + let PseudoInstr = NAME # "_RTN"; + } +} + +class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] +>; + +def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; +def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>; +def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>; +def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>; +def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>; +def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>; +def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>; +def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>; +def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>; +def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>; +def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>; +def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>; +def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>; + + + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// + +def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; + +def FLAT_STORE_BYTE : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>; +def FLAT_STORE_SHORT : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>; +def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; +def 
FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; + +defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", + VGPR_32, i32, atomic_cmp_swap_flat, + v2i32, VReg_64>; + +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2", + VReg_64, i64, atomic_cmp_swap_flat, + v2i64, VReg_128>; + +defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap", + VGPR_32, i32, atomic_swap_flat>; + +defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2", + VReg_64, i64, atomic_swap_flat>; + +defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add", + VGPR_32, i32, atomic_add_flat>; + +defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub", + VGPR_32, i32, atomic_sub_flat>; + +defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin", + VGPR_32, i32, atomic_min_flat>; + +defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin", + VGPR_32, i32, atomic_umin_flat>; + +defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax", + VGPR_32, i32, atomic_max_flat>; + +defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax", + VGPR_32, i32, atomic_umax_flat>; + +defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and", + VGPR_32, i32, atomic_and_flat>; + +defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or", + VGPR_32, i32, atomic_or_flat>; + +defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor", + VGPR_32, i32, atomic_xor_flat>; + +defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc", + VGPR_32, i32, atomic_inc_flat>; + +defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec", + VGPR_32, i32, atomic_dec_flat>; + +defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2", + VReg_64, i64, atomic_add_flat>; + +defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2", + VReg_64, i64, atomic_sub_flat>; + +defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2", + VReg_64, i64, atomic_min_flat>; + +defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2", + VReg_64, i64, atomic_umin_flat>; + +defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2", + VReg_64, i64, atomic_max_flat>; + +defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2", + VReg_64, i64, atomic_umax_flat>; + +defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2", + VReg_64, i64, atomic_and_flat>; + +defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2", + VReg_64, i64, atomic_or_flat>; + +defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2", + VReg_64, i64, atomic_xor_flat>; + +defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", + VReg_64, i64, atomic_inc_flat>; + +defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", + VReg_64, i64, atomic_dec_flat>; + +let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only? 
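Each FLAT_Atomic_Pseudo defm above expands into a no-return form (glcValue = 0, no vdst) and an _RTN form (glcValue = 1) that carries the selection pattern and, on this reading of the glc convention, returns the pre-operation value in vdst. The toy sketch below is an illustration only and simply shows the naming choice an instruction selector would make depending on whether the atomic's result is used; the CI-only definitions continue after it.

#include <cassert>
#include <string>

// Pick between the paired FLAT atomic opcodes generated by
// FLAT_Atomic_Pseudo: the _RTN form is the one that sets glc.
static std::string flatAtomicOpcode(const std::string &Base, bool ResultUsed,
                                    bool &GlcOut) {
  GlcOut = ResultUsed; // glcValue = 1 only on the _RTN variant
  return ResultUsed ? Base + "_RTN" : Base;
}

int main() {
  bool Glc = false;
  assert(flatAtomicOpcode("FLAT_ATOMIC_ADD", /*ResultUsed=*/true, Glc) ==
             "FLAT_ATOMIC_ADD_RTN" &&
         Glc);
  assert(flatAtomicOpcode("FLAT_ATOMIC_ADD", /*ResultUsed=*/false, Glc) ==
             "FLAT_ATOMIC_ADD" &&
         !Glc);
  return 0;
}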
+ +defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", + VGPR_32, f32, null_frag, v2f32, VReg_64>; + +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", + VReg_64, f64, null_frag, v2f64, VReg_128>; + +defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin", + VGPR_32, f32>; + +defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", + VGPR_32, f32>; + +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", + VReg_64, f64>; + +defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", + VReg_64, f64>; + +} // End SubtargetPredicate = isCI + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), + (ld node:$ptr), [{ + auto const AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +}]>; + +class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + auto const AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +def atomic_flat_load : flat_ld <atomic_load>; +def flat_load : flat_ld <load>; +def flat_az_extloadi8 : flat_ld <az_extloadi8>; +def flat_sextloadi8 : flat_ld <sextloadi8>; +def flat_az_extloadi16 : flat_ld <az_extloadi16>; +def flat_sextloadi16 : flat_ld <sextloadi16>; + +def atomic_flat_store : flat_st <atomic_store>; +def flat_store : flat_st <store>; +def flat_truncstorei8 : flat_st <truncstorei8>; +def flat_truncstorei16 : flat_st <truncstorei16>; + +// Patterns for global loads with no offset. +class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 1, 0, 0) +>; + +class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $addr, $data, 0, 0, 0) +>; + +class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < + // atomic store follows atomic binop convention so the address comes + // first. 
+ (node i64:$addr, vt:$data), + (inst $addr, $data, 1, 0, 0) +>; + +class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, + ValueType data_vt = vt> : Pat < + (vt (node i64:$addr, data_vt:$data)), + (inst $addr, $data, 0, 0) +>; + +let Predicates = [isCIVI] in { + +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>; +def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; + +def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>; + +def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; + +def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; + +} // End Predicates = [isCIVI] + +let Predicates = [isVI] in { + def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>; + def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>; +} + + 
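A note on the address-space fragments above: flat_ld lets a FLAT load pattern also match global and constant address-space loads, while flat_st only lets FLAT store patterns match flat and global stores. The following is a minimal standalone C++ sketch of that predicate logic, not part of the patch; the AddrSpace enum and function names are illustrative stand-ins for the AMDGPUAS IDs used by the real PatFrags.

// Illustrative sketch only: mirrors the flat_ld / flat_st PatFrag predicates.
enum class AddrSpace { Flat, Global, Constant, Local, Private };

// flat_ld: a FLAT load pattern may also fold global and constant loads.
static bool matchesFlatLoad(AddrSpace AS) {
  return AS == AddrSpace::Flat || AS == AddrSpace::Global ||
         AS == AddrSpace::Constant;
}

// flat_st: a FLAT store pattern folds only flat and global stores.
static bool matchesFlatStore(AddrSpace AS) {
  return AS == AddrSpace::Flat || AS == AddrSpace::Global;
}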
+//===----------------------------------------------------------------------===// +// Target +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// CI +//===----------------------------------------------------------------------===// + +class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> : + FLAT_Real <op, ps>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> { + let AssemblerPredicate = isCIOnly; + let DecoderNamespace="CI"; +} + +def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>; +def FLAT_LOAD_SBYTE_ci : FLAT_Real_ci <0x9, FLAT_LOAD_SBYTE>; +def FLAT_LOAD_USHORT_ci : FLAT_Real_ci <0xa, FLAT_LOAD_USHORT>; +def FLAT_LOAD_SSHORT_ci : FLAT_Real_ci <0xb, FLAT_LOAD_SSHORT>; +def FLAT_LOAD_DWORD_ci : FLAT_Real_ci <0xc, FLAT_LOAD_DWORD>; +def FLAT_LOAD_DWORDX2_ci : FLAT_Real_ci <0xd, FLAT_LOAD_DWORDX2>; +def FLAT_LOAD_DWORDX4_ci : FLAT_Real_ci <0xe, FLAT_LOAD_DWORDX4>; +def FLAT_LOAD_DWORDX3_ci : FLAT_Real_ci <0xf, FLAT_LOAD_DWORDX3>; + +def FLAT_STORE_BYTE_ci : FLAT_Real_ci <0x18, FLAT_STORE_BYTE>; +def FLAT_STORE_SHORT_ci : FLAT_Real_ci <0x1a, FLAT_STORE_SHORT>; +def FLAT_STORE_DWORD_ci : FLAT_Real_ci <0x1c, FLAT_STORE_DWORD>; +def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>; +def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>; +def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>; + +multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> { + def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; + def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +} + +defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>; +defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>; +defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>; +defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>; +defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>; +defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>; +defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>; +defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>; +defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>; +defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>; +defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>; +defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>; +defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>; +defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>; +defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>; +defm 
FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>; + +// CI Only flat instructions +defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>; +defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>; +defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>; + + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> : + FLAT_Real <op, ps>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + let AssemblerPredicate = isVI; + let DecoderNamespace="VI"; +} + +def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; +def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; +def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; +def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>; +def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>; +def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>; +def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>; +def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>; + +def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>; +def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>; +def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>; +def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>; +def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>; +def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>; + +multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> { + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; + def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +} + +defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>; +defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>; +defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>; +defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>; +defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>; +defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>; +defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>; +defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>; +defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>; +defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>; +defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>; +defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_vi <0x66, 
FLAT_ATOMIC_SMAX_X2>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>; +defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>; +defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; +defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; + diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 29b1f79..dd3b46f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -38,6 +38,33 @@ void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { CurrCycleInstr = MI; } +static bool isDivFMas(unsigned Opcode) { + return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64; +} + +static bool isSGetReg(unsigned Opcode) { + return Opcode == AMDGPU::S_GETREG_B32; +} + +static bool isSSetReg(unsigned Opcode) { + return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32; +} + +static bool isRWLane(unsigned Opcode) { + return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; +} + +static bool isRFE(unsigned Opcode) { + return Opcode == AMDGPU::S_RFE_B64; +} + +static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { + + const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, + AMDGPU::OpName::simm16); + return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; +} + ScheduleHazardRecognizer::HazardType GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); @@ -48,9 +75,27 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) return NoopHazard; + if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) + return NoopHazard; + if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) return NoopHazard; + if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) + return NoopHazard; + + if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) + return NoopHazard; + + if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) + return NoopHazard; + + if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) + return NoopHazard; + + if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) + return NoopHazard; + return NoHazard; } @@ -62,11 +107,32 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (SIInstrInfo::isSMRD(*MI)) return std::max(0, checkSMRDHazards(MI)); - if (SIInstrInfo::isVMEM(*MI)) - return std::max(0, checkVMEMHazards(MI)); + if (SIInstrInfo::isVALU(*MI)) { + int WaitStates = std::max(0, checkVALUHazards(MI)); - if (SIInstrInfo::isDPP(*MI)) - return std::max(0, checkDPPHazards(MI)); + if (SIInstrInfo::isVMEM(*MI)) + WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + + if (SIInstrInfo::isDPP(*MI)) + WaitStates = std::max(WaitStates, checkDPPHazards(MI)); + + if (isDivFMas(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); + + if (isRWLane(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + + return WaitStates; + } + + if (isSGetReg(MI->getOpcode())) + return std::max(0, checkGetRegHazards(MI)); + + if (isSSetReg(MI->getOpcode())) + return std::max(0, checkSetRegHazards(MI)); + + if (isRFE(MI->getOpcode())) + return std::max(0, 
checkRFEHazards(MI)); return 0; } @@ -112,21 +178,40 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -int GCNHazardRecognizer::getWaitStatesSinceDef( - unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) { - const SIRegisterInfo *TRI = ST.getRegisterInfo(); +int GCNHazardRecognizer::getWaitStatesSince( + function_ref<bool(MachineInstr *)> IsHazard) { int WaitStates = -1; for (MachineInstr *MI : EmittedInstrs) { ++WaitStates; - if (!MI || !IsHazardDef(MI)) + if (!MI || !IsHazard(MI)) continue; - if (MI->modifiesRegister(Reg, TRI)) - return WaitStates; + return WaitStates; } return std::numeric_limits<int>::max(); } +int GCNHazardRecognizer::getWaitStatesSinceDef( + unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { + return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); + }; + + return getWaitStatesSince(IsHazardFn); +} + +int GCNHazardRecognizer::getWaitStatesSinceSetReg( + function_ref<bool(MachineInstr *)> IsHazard) { + + auto IsHazardFn = [IsHazard] (MachineInstr *MI) { + return isSSetReg(MI->getOpcode()) && IsHazard(MI); + }; + + return getWaitStatesSince(IsHazardFn); +} + //===----------------------------------------------------------------------===// // No-op Hazard Detection //===----------------------------------------------------------------------===// @@ -262,3 +347,156 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { return WaitStatesNeeded; } + +int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { + const SIInstrInfo *TII = ST.getInstrInfo(); + + // v_div_fmas requires 4 wait states after a write to vcc from a VALU + // instruction. + const int DivFMasWaitStates = 4; + auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn); + + return DivFMasWaitStates - WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { + const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); + + const int GetRegWaitStates = 2; + auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { + return GetRegHWReg == getHWReg(TII, *MI); + }; + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + + return GetRegWaitStates - WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { + const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned HWReg = getHWReg(TII, *SetRegInstr); + + const int SetRegWaitStates = + ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 
1 : 2; + auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { + return HWReg == getHWReg(TII, *MI); + }; + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + return SetRegWaitStates - WaitStatesNeeded; +} + +int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { + if (!MI.mayStore()) + return -1; + + const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + + int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); + int VDataRCID = -1; + if (VDataIdx != -1) + VDataRCID = Desc.OpInfo[VDataIdx].RegClass; + + if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { + // There is no hazard if the instruction does not use vector regs + // (like wbinvl1) + if (VDataIdx == -1) + return -1; + // For MUBUF/MTBUF instructions this hazard only exists if the + // instruction is not using a register in the soffset field. + const MachineOperand *SOffset = + TII->getNamedOperand(MI, AMDGPU::OpName::soffset); + // If we have no soffset operand, then assume this field has been + // hardcoded to zero. + if (AMDGPU::getRegBitWidth(VDataRCID) > 64 && + (!SOffset || !SOffset->isReg())) + return VDataIdx; + } + + // MIMG instructions create a hazard if they don't use a 256-bit T# and + // the store size is greater than 8 bytes and they have more than two bits + // of their dmask set. + // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. + if (TII->isMIMG(MI)) { + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); + assert(SRsrcIdx != -1 && + AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256); + (void)SRsrcIdx; + } + + if (TII->isFLAT(MI)) { + int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); + if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64) + return DataIdx; + } + + return -1; +} + +int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { + // This checks for the hazard where VMEM instructions that store more than + // 8 bytes can have their store data overwritten by the next instruction.
+ if (!ST.has12DWordStoreHazard()) + return 0; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo(); + + const int VALUWaitStates = 1; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Def : VALU->defs()) { + if (!TRI->isVGPR(MRI, Def.getReg())) + continue; + unsigned Reg = Def.getReg(); + auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { + int DataIdx = createsVALUHazard(*MI); + return DataIdx >= 0 && + TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); + }; + int WaitStatesNeededForDef = + VALUWaitStates - getWaitStatesSince(IsHazardFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = + RWLane->getParent()->getParent()->getRegInfo(); + + const MachineOperand *LaneSelectOp = + TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); + + if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) + return 0; + + unsigned LaneSelectReg = LaneSelectOp->getReg(); + auto IsHazardFn = [TII] (MachineInstr *MI) { + return TII->isVALU(*MI); + }; + + const int RWLaneWaitStates = 4; + int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn); + return RWLaneWaitStates - WaitStatesSince; +} + +int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { + + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + + const int RFEWaitStates = 1; + + auto IsHazardFn = [TII] (MachineInstr *MI) { + return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; + }; + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + return RFEWaitStates - WaitStatesNeeded; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index d82041c..0ab82ff 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -35,14 +35,23 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const MachineFunction &MF; const SISubtarget &ST; + int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); int getWaitStatesSinceDef(unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef = [](MachineInstr *) { return true; }); + int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard); int checkSMEMSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); int checkVMEMHazards(MachineInstr* VMEM); int checkDPPHazards(MachineInstr *DPP); + int checkDivFMasHazards(MachineInstr *DivFMas); + int checkGetRegHazards(MachineInstr *GetRegInstr); + int checkSetRegHazards(MachineInstr *SetRegInstr); + int createsVALUHazard(const MachineInstr &MI); + int checkVALUHazards(MachineInstr *VALU); + int checkRWLaneHazards(MachineInstr *RWLane); + int checkRFEHazards(MachineInstr *RFE); public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. 
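The hazard checks added above all share one piece of arithmetic: count how many instructions have been issued since the most recent hazard-producing instruction and subtract that from the required wait-state budget, with PreEmitNoops() clamping the result at zero. Below is a minimal standalone sketch of that bookkeeping, assuming a newest-first list of issued instructions; FakeInstr and both helper names are illustrative, not LLVM APIs.

// Standalone sketch (not part of the patch) of the shared wait-state logic.
#include <algorithm>
#include <functional>
#include <limits>
#include <vector>

struct FakeInstr {
  bool IsVALU;
  bool WritesVCC;
};

// Counterpart of getWaitStatesSince(): EmittedInstrs is ordered newest first,
// so the index of the first matching instruction is the number of wait states
// that have already elapsed.
static int waitStatesSince(const std::vector<FakeInstr> &EmittedInstrs,
                           const std::function<bool(const FakeInstr &)> &IsHazard) {
  int WaitStates = -1;
  for (const FakeInstr &MI : EmittedInstrs) {
    ++WaitStates;
    if (IsHazard(MI))
      return WaitStates;
  }
  return std::numeric_limits<int>::max();
}

// Counterpart of checkDivFMasHazards(): v_div_fmas needs 4 wait states after
// a VALU write to VCC, so the nops still required are "4 minus elapsed".
static int divFMasNoopsNeeded(const std::vector<FakeInstr> &EmittedInstrs) {
  const int DivFMasWaitStates = 4;
  int Elapsed = waitStatesSince(EmittedInstrs, [](const FakeInstr &MI) {
    return MI.IsVALU && MI.WritesVCC;
  });
  return std::max(0, DivFMasWaitStates - Elapsed);
}

For example, a v_div_fmas issued immediately after the VCC-writing VALU still needs all four wait states, and each unrelated instruction in between removes one.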
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp new file mode 100644 index 0000000..2f88033 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -0,0 +1,312 @@ +//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This contains a MachineSchedStrategy implementation for maximizing wave +/// occupancy on GCN hardware. +//===----------------------------------------------------------------------===// + +#include "GCNSchedStrategy.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" + +#define DEBUG_TYPE "misched" + +using namespace llvm; + +GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( + const MachineSchedContext *C) : + GenericScheduler(C) { } + +static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, + const MachineFunction &MF) { + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), + ST.getOccupancyWithNumVGPRs(VGPRs)); + return std::min(MinRegOccupancy, + ST.getOccupancyWithLocalMemSize(MFI->getLDSSize())); +} + +void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, + bool AtTop, const RegPressureTracker &RPTracker, + const SIRegisterInfo *SRI, + int SGPRPressure, + int VGPRPressure, + int SGPRExcessLimit, + int VGPRExcessLimit, + int SGPRCriticalLimit, + int VGPRCriticalLimit) { + + Cand.SU = SU; + Cand.AtTop = AtTop; + + // getDownwardPressure() and getUpwardPressure() make temporary changes to + // the tracker, so we need to pass those functions a non-const copy. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); + + std::vector<unsigned> Pressure; + std::vector<unsigned> MaxPressure; + + if (AtTop) + TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); + else { + // FIXME: I think for bottom up scheduling, the register pressure is cached + // and can be retrieved by DAG->getPressureDiff(SU). + TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + } + + int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; + int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + + // If two instructions increase the pressure of different register sets + // by the same amount, the generic scheduler will prefer to schedule the + // instruction that increases the set with the least amount of registers, + // which in our case would be SGPRs. This is rarely what we want, so + // when we report excess/critical register pressure, we do it either + // only for VGPRs or only for SGPRs. + + // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. + const int MaxVGPRPressureInc = 16; + bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; + bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; + + + // FIXME: We have to enter REG-EXCESS before we reach the actual threshold + // to increase the likelihood we don't go over the limits.
We should improve + // the analysis to look through dependencies to find the path with the least + // register pressure. + // FIXME: This is also necessary, because some passes that run after + // scheduling and before regalloc increase register pressure. + const int ErrorMargin = 3; + VGPRExcessLimit -= ErrorMargin; + SGPRExcessLimit -= ErrorMargin; + + // We only need to update the RPDelta for instructions that increase + // register pressure. Instructions that decrease or keep reg pressure + // the same will be marked as RegExcess in tryCandidate() when they + // are compared with instructions that increase the register pressure. + if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { + Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet()); + Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); + } + + if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { + Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet()); + Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); + } + + // Register pressure is considered 'CRITICAL' if it is approaching a value + // that would reduce the wave occupancy for the execution unit. When + // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both + // have the same cost, so we don't need to prefer one over the other. + + VGPRCriticalLimit -= ErrorMargin; + SGPRCriticalLimit -= ErrorMargin; + + int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; + int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; + + if (SGPRDelta >= 0 || VGPRDelta >= 0) { + if (SGPRDelta > VGPRDelta) { + Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet()); + Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); + } else { + Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet()); + Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); + } + } +} + +// This function is mostly cut and pasted from +// GenericScheduler::pickNodeFromQueue() +void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, + const CandPolicy &ZonePolicy, + const RegPressureTracker &RPTracker, + SchedCandidate &Cand) { + const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); + ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); + unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()]; + unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + unsigned SGPRExcessLimit = + Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); + unsigned VGPRExcessLimit = + Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); + unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF); + unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true); + unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves); + + ReadyQueue &Q = Zone.Available; + for (SUnit *SU : Q) { + + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, + SGPRPressure, VGPRPressure, + SGPRExcessLimit, VGPRExcessLimit, + SGPRCriticalLimit, VGPRCriticalLimit); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(Zone.DAG, SchedModel); + Cand.setBest(TryCand); + } + } +} + +static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) { + switch (Reason) { + default: + return Reason; + case GenericSchedulerBase::RegCritical: + case GenericSchedulerBase::RegExcess: + return -Reason; + } +} + +// This function is mostly cut and pasted from +// GenericScheduler::pickNodeBidirectional() +SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + // Set the bottom-up policy based on the state of the current bottom zone and + // the instructions outside the zone, including the top zone. + CandPolicy BotPolicy; + setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); + // Set the top-down policy based on the state of the current top zone and + // the instructions outside the zone, including the bottom zone. + CandPolicy TopPolicy; + setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + + // See if BotCand is still valid (because we previously scheduled from Top). + DEBUG(dbgs() << "Picking from Bot:\n"); + if (!BotCand.isValid() || BotCand.SU->isScheduled || + BotCand.Policy != BotPolicy) { + BotCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(BotCand)); + } + + // Check if the top Q has a better candidate. + DEBUG(dbgs() << "Picking from Top:\n"); + if (!TopCand.isValid() || TopCand.SU->isScheduled || + TopCand.Policy != TopPolicy) { + TopCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(TopCand)); + } + + // Pick best from BotCand and TopCand. 
+ DEBUG( + dbgs() << "Top Cand: "; + traceCandidate(BotCand); + dbgs() << "Bot Cand: "; + traceCandidate(TopCand); + ); + SchedCandidate Cand; + if (TopCand.Reason == BotCand.Reason) { + Cand = BotCand; + GenericSchedulerBase::CandReason TopReason = TopCand.Reason; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); + } else { + TopCand.Reason = TopReason; + } + } else { + if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { + Cand = TopCand; + } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { + Cand = BotCand; + } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { + Cand = TopCand; + } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { + Cand = BotCand; + } else { + int TopRank = getBidirectionalReasonRank(TopCand.Reason); + int BotRank = getBidirectionalReasonRank(BotCand.Reason); + if (TopRank > BotRank) { + Cand = TopCand; + } else { + Cand = BotCand; + } + } + } + DEBUG( + dbgs() << "Picking: "; + traceCandidate(Cand); + ); + + IsTopNode = Cand.AtTop; + return Cand.SU; +} + +// This function is mostly cut and pasted from +// GenericScheduler::pickNode() +SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) { + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); + return nullptr; + } + SUnit *SU; + do { + if (RegionPolicy.OnlyTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + TopCand.reset(NoPolicy); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find a candidate"); + SU = TopCand.SU; + } + IsTopNode = true; + } else if (RegionPolicy.OnlyBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + BotCand.reset(NoPolicy); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find a candidate"); + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidirectional(IsTopNode); + } + } while (SU->isScheduled); + + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); + return SU; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h new file mode 100644 index 0000000..4cfc0ce --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -0,0 +1,54 @@ +//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H +#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIRegisterInfo; + +/// This is a minimal scheduler strategy. 
The main difference between this +/// and the GenericScheduler is that GCNSchedStrategy uses different +/// heuristics to determine excess/critical pressure sets. Its goal is to +/// maximize kernel occupancy (i.e. maximum number of waves per simd). +class GCNMaxOccupancySchedStrategy : public GenericScheduler { + + SUnit *pickNodeBidirectional(bool &IsTopNode); + + void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, + const RegPressureTracker &RPTracker, + SchedCandidate &Cand); + + void initCandidate(SchedCandidate &Cand, SUnit *SU, + bool AtTop, const RegPressureTracker &RPTracker, + const SIRegisterInfo *SRI, + int SGPRPressure, int VGPRPressure, + int SGPRExcessLimit, int VGPRExcessLimit, + int SGPRCriticalLimit, int VGPRCriticalLimit); + + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone, const SIRegisterInfo *SRI, + unsigned SGPRPressure, unsigned VGPRPressure); + +public: + GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); + + SUnit *pickNode(bool &IsTopNode) override; +}; + +} // End namespace llvm + +#endif // GCNSCHEDSTRATEGY_H diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index 2932d3b..7172a0a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -9,46 +9,52 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstPrinter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUAsmUtils.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" - -#include <string> +#include <cassert> using namespace llvm; +using namespace llvm::AMDGPU; void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, const MCSubtargetInfo &STI) { OS.flush(); - printInstruction(MI, OS); - + printInstruction(MI, STI, OS); printAnnotation(OS, Annot); } void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { O << formatHex(MI->getOperand(OpNo).getImm() & 0xf); } void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + raw_ostream &O) { O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); } void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); + // It's possible to end up with a 32-bit literal used with a 16-bit operand + // with ignored high bits. Print as 32-bit anyway in that case. 
+ int64_t Imm = MI->getOperand(OpNo).getImm(); + if (isInt<16>(Imm) || isUInt<16>(Imm)) + O << formatHex(static_cast<uint64_t>(Imm & 0xffff)); + else + printU32ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, @@ -66,8 +72,14 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } -void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo, - raw_ostream& O, StringRef BitName) { +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + +void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef BitName) { if (MI->getOperand(OpNo).getImm()) { O << ' ' << BitName; } @@ -97,7 +109,8 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { O << " offset:"; @@ -106,7 +119,8 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset0:"; printU8ImmDecOperand(MI, OpNo, O); @@ -114,74 +128,97 @@ void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset1:"; printU8ImmDecOperand(MI, OpNo, O); } } -void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - printU32ImmOperand(MI, OpNo, O); + printU32ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - printU32ImmOperand(MI, OpNo, O); + printU32ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "gds"); } void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "glc"); } void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "slc"); } void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); } void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " dmask:"; - printU16ImmOperand(MI, OpNo, O); + printU16ImmOperand(MI, OpNo, STI, O); } } void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + 
const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "unorm"); } void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "da"); } void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "r128"); } void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "lwe"); } -void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, +void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " compr"; +} + +void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " vm"; +} + +void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { - switch (reg) { + switch (RegNo) { case AMDGPU::VCC: O << "vcc"; return; @@ -233,52 +270,54 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, // The low 8 bits of the encoding value is the register index, for both VGPRs // and SGPRs. - unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1); unsigned NumRegs; - if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) { O << 's'; NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) { O <<'v'; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) { O << 's'; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) { O << 's'; NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 3; - } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) { O << 's'; NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) { O << 's'; NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) { + } else if 
(MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(RegNo)) { O << "ttmp"; NumRegs = 2; - RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. - } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) { + // Trap temps start at offset 112. TODO: Get this from tablegen. + RegIdx -= 112; + } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(RegNo)) { O << "ttmp"; NumRegs = 4; - RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. + // Trap temps start at offset 112. TODO: Get this from tablegen. + RegIdx -= 112; } else { - O << getRegisterName(reg); + O << getRegisterName(RegNo); return; } @@ -291,7 +330,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) O << "_e64 "; else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) @@ -301,10 +340,44 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, else O << "_e32 "; - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == 0x3C00) + O<< "1.0"; + else if (Imm == 0xBC00) + O<< "-1.0"; + else if (Imm == 0x3800) + O<< "0.5"; + else if (Imm == 0xB800) + O<< "-0.5"; + else if (Imm == 0x4000) + O<< "2.0"; + else if (Imm == 0xC000) + O<< "-2.0"; + else if (Imm == 0x4400) + O<< "4.0"; + else if (Imm == 0xC400) + O<< "-4.0"; + else if (Imm == 0x3118) { + assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]); + O << "0.15915494"; + } else + O << formatHex(static_cast<uint64_t>(Imm)); } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); if (SImm >= -16 && SImm <= 64) { O << SImm; @@ -329,11 +402,16 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == FloatToBits(-4.0f)) O << "-4.0"; + else if (Imm == 0x3e22f983 && + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + O << "0.15915494"; else O << formatHex(static_cast<uint64_t>(Imm)); } -void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { int64_t SImm = static_cast<int64_t>(Imm); if (SImm >= -16 && SImm <= 64) { O << SImm; @@ -358,8 +436,11 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; + else if (Imm == 0x3fc45f306dc9c882 && + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + O << "0.15915494"; else { - assert(isUInt<32>(Imm)); + assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882); // In rare situations, we will have a 32-bit literal in a 64-bit // operand. This is technically allowed for the encoding of s_mov_b64. 
@@ -368,7 +449,12 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (OpNo >= MI->getNumOperands()) { + O << "/*Missing OP" << OpNo << "*/"; + return; + } const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { @@ -383,22 +469,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else if (Op.isImm()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.OpInfo[OpNo].RegClass; - if (RCID != -1) { - const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); - if (ImmRC.getSize() == 4) - printImmediate32(Op.getImm(), O); - else if (ImmRC.getSize() == 8) - printImmediate64(Op.getImm(), O); - else - llvm_unreachable("Invalid register class size"); - } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { - printImmediate32(Op.getImm(), O); - } else { + switch (Desc.OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case MCOI::OPERAND_IMMEDIATE: + printImmediate32(Op.getImm(), STI, O); + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + printImmediate64(Op.getImm(), STI, O); + break; + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + printImmediate16(Op.getImm(), STI, O); + break; + case MCOI::OPERAND_UNKNOWN: + case MCOI::OPERAND_PCREL: + O << formatDec(Op.getImm()); + break; + case MCOI::OPERAND_REGISTER: + // FIXME: This should be removed and handled somewhere else. Seems to come + // from a disassembler bug. + O << "/*invalid immediate*/"; + break; + default: // We hit this for the immediate instruction bits that don't yet have a // custom printer. - // TODO: Eventually this should be unnecessary. - O << formatDec(Op.getImm()); + llvm_unreachable("unexpected immediate operand type"); } } else if (Op.isFPImm()) { // We special case 0.0 because otherwise it will be printed as an integer. 
@@ -406,12 +509,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << "0.0"; else { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); - - if (ImmRC.getSize() == 4) - printImmediate32(FloatToBits(Op.getFPImm()), O); - else if (ImmRC.getSize() == 8) - printImmediate64(DoubleToBits(Op.getFPImm()), O); + int RCID = Desc.OpInfo[OpNo].RegClass; + unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID)); + if (RCBits == 32) + printImmediate32(FloatToBits(Op.getFPImm()), STI, O); + else if (RCBits == 64) + printImmediate64(DoubleToBits(Op.getFPImm()), STI, O); else llvm_unreachable("Invalid register class size"); } @@ -424,32 +527,34 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::NEG) O << '-'; if (InputModifiers & SISrcMods::ABS) O << '|'; - printOperand(MI, OpNo + 1, O); + printOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::ABS) O << '|'; } void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::SEXT) O << "sext("; - printOperand(MI, OpNo + 1, O); + printOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::SEXT) O << ')'; } - void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); if (Imm <= 0x0ff) { O << " quad_perm:["; @@ -488,19 +593,22 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { O << " row_mask:"; - printU4ImmOperand(MI, OpNo, O); + printU4ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { O << " bank_mask:"; - printU4ImmOperand(MI, OpNo, O); + printU4ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); if (Imm) { O << " bound_ctrl:0"; // XXX - this syntax is used in sp3 @@ -509,69 +617,180 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + using namespace llvm::AMDGPU::SDWA; + unsigned Imm = MI->getOperand(OpNo).getImm(); switch (Imm) { - case 0: O << "BYTE_0"; break; - case 1: O << "BYTE_1"; break; - case 2: O << "BYTE_2"; break; - case 3: O << "BYTE_3"; break; - case 4: O << "WORD_0"; break; - case 5: O << "WORD_1"; break; - case 6: O << "DWORD"; break; + case SdwaSel::BYTE_0: O << "BYTE_0"; break; + case SdwaSel::BYTE_1: O << "BYTE_1"; break; + case SdwaSel::BYTE_2: O << "BYTE_2"; break; + case SdwaSel::BYTE_3: O << "BYTE_3"; break; + case SdwaSel::WORD_0: O << "WORD_0"; break; + case SdwaSel::WORD_1: O << "WORD_1"; break; + case SdwaSel::DWORD: O << "DWORD"; break; 
default: llvm_unreachable("Invalid SDWA data select operand"); } } void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { O << "dst_sel:"; printSDWASel(MI, OpNo, O); } void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { O << "src0_sel:"; printSDWASel(MI, OpNo, O); } void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { O << "src1_sel:"; printSDWASel(MI, OpNo, O); } void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + using namespace llvm::AMDGPU::SDWA; + O << "dst_unused:"; unsigned Imm = MI->getOperand(OpNo).getImm(); switch (Imm) { - case 0: O << "UNUSED_PAD"; break; - case 1: O << "UNUSED_SEXT"; break; - case 2: O << "UNUSED_PRESERVE"; break; + case DstUnused::UNUSED_PAD: O << "UNUSED_PAD"; break; + case DstUnused::UNUSED_SEXT: O << "UNUSED_SEXT"; break; + case DstUnused::UNUSED_PRESERVE: O << "UNUSED_PRESERVE"; break; default: llvm_unreachable("Invalid SDWA dest_unused operand"); } } +template <unsigned N> +void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en); + unsigned En = MI->getOperand(EnIdx).getImm(); + + // FIXME: What do we do with compr? The meaning of en changes depending on if + // compr is set. + + if (En & (1 << N)) + printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI); + else + O << "off"; +} + +void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printExpSrcN<0>(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printExpSrcN<1>(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printExpSrcN<2>(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printExpSrcN<3>(MI, OpNo, STI, O); +} + +void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // This is really a 6 bit field. 
+ uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1); + + if (Tgt <= 7) + O << " mrt" << Tgt; + else if (Tgt == 8) + O << " mrtz"; + else if (Tgt == 9) + O << " null"; + else if (Tgt >= 12 && Tgt <= 15) + O << " pos" << Tgt - 12; + else if (Tgt >= 32 && Tgt <= 63) + O << " param" << Tgt - 32; + else { + // Reserved values 10, 11 + O << " invalid_target_" << Tgt; + } +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); + switch (Imm) { + case 0: + O << "p10"; + break; + case 1: + O << "p20"; + break; + case 2: + O << "p0"; + break; + default: + O << "invalid_param_" << Imm; + } +} - if (Imm == 2) { - O << "P0"; - } else if (Imm == 1) { - O << "P20"; - } else if (Imm == 0) { - O << "P10"; - } else { - llvm_unreachable("Invalid interpolation parameter slot"); +void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Attr = MI->getOperand(OpNum).getImm(); + O << "attr" << Attr; +} + +void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Chan = MI->getOperand(OpNum).getImm(); + O << '.' << "xyzw"[Chan & 0x3]; +} + +void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + if (Val == 0) { + O << " 0"; + return; } + + if (Val & VGPRIndexMode::DST_ENABLE) + O << " dst"; + + if (Val & VGPRIndexMode::SRC0_ENABLE) + O << " src0"; + + if (Val & VGPRIndexMode::SRC1_ENABLE) + O << " src1"; + + if (Val & VGPRIndexMode::SRC2_ENABLE) + O << " src2"; } void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); O << ", "; - printOperand(MI, OpNo + 1, O); + printOperand(MI, OpNo + 1, STI, O); } void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, @@ -595,23 +814,25 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, '|'); } void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, "_SAT"); } void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) O << " clamp"; } void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { int Imm = MI->getOperand(OpNo).getImm(); if (Imm == SIOutMods::MUL2) O << " mul:2"; @@ -622,6 +843,7 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); assert(Op.isImm() || Op.isExpr()); @@ -635,17 +857,17 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, "*", " "); } void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - 
raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, '-'); } void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { switch (MI->getOperand(OpNo).getImm()) { default: break; case 1: @@ -661,22 +883,24 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, '+'); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, "ExecMask,"); } void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { printIfSet(MI, OpNo, O, "Pred,"); } void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.getImm() == 0) { O << " (MASKED)"; @@ -684,7 +908,7 @@ void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + raw_ostream &O) { const char * chans = "XYZW"; int sel = MI->getOperand(OpNo).getImm(); @@ -708,6 +932,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { int BankSwizzle = MI->getOperand(OpNo).getImm(); switch (BankSwizzle) { @@ -729,11 +954,10 @@ void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, default: break; } - return; } void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Sel = MI->getOperand(OpNo).getImm(); switch (Sel) { case 0: @@ -763,7 +987,7 @@ void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned CT = MI->getOperand(OpNo).getImm(); switch (CT) { case 0: @@ -778,7 +1002,7 @@ void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { int KCacheMode = MI->getOperand(OpNo).getImm(); if (KCacheMode > 0) { int KCacheBank = MI->getOperand(OpNo - 2).getImm(); @@ -790,6 +1014,7 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { using namespace llvm::AMDGPU::SendMsg; @@ -825,32 +1050,34 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')'; return; } - } while (0); + } while (false); O << SImm16; // Unknown simm16 code. 
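A note on the printWaitFlag hunk that follows: the old code hard-wired the pre-gfx9 s_waitcnt bit layout, while the new code derives the per-field masks from the ISA version via decodeWaitcnt and the get*BitMask helpers. A minimal standalone sketch of that legacy layout, written here only for illustration (it is not part of the patch):

  #include <cassert>
  #include <cstdint>

  // Legacy s_waitcnt simm16 layout, as implied by the masks the removed
  // printWaitFlag used: vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [11:8].
  struct LegacyWaitcnt {
    unsigned Vmcnt, Expcnt, Lgkmcnt;
  };

  static LegacyWaitcnt decodeLegacyWaitcnt(uint16_t SImm16) {
    return {SImm16 & 0xFu, (SImm16 >> 4) & 0x7u, (SImm16 >> 8) & 0xFu};
  }

  int main() {
    // 0x0070 encodes vmcnt(0) and lgkmcnt(0) while leaving expcnt at its
    // "no wait" value of 7, so the printer would emit "vmcnt(0) lgkmcnt(0)".
    LegacyWaitcnt W = decodeLegacyWaitcnt(0x0070);
    assert(W.Vmcnt == 0 && W.Expcnt == 7 && W.Lgkmcnt == 0);
    return 0;
  }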
} void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + IsaVersion IV = getIsaVersion(STI.getFeatureBits()); + unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0x7; - unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; + unsigned Vmcnt, Expcnt, Lgkmcnt; + decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt); bool NeedSpace = false; - if (Vmcnt != 0xF) { + if (Vmcnt != getVmcntBitMask(IV)) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != 0x7) { + if (Expcnt != getExpcntBitMask(IV)) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != 0xF) { + if (Lgkmcnt != getLgkmcntBitMask(IV)) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; @@ -858,7 +1085,7 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { using namespace llvm::AMDGPU::Hwreg; unsigned SImm16 = MI->getOperand(OpNo).getImm(); diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index f5a290f..a6d348f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -24,7 +24,8 @@ public: : MCInstPrinter(MAI, MII, MRI) {} //Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); static const char *getRegisterName(unsigned RegNo); void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, @@ -33,76 +34,159 @@ public: const MCRegisterInfo &MRI); private: - void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU4ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printNamedBit(const MCInst* MI, unsigned OpNo, raw_ostream& O, + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef BitName); void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGDS(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); - void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printSMRDOffset8(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSMRDOffset20(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printLWE(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpCompr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpVM(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); - void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printImmediate32(uint32_t I, raw_ostream &O); - void printImmediate64(uint64_t I, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, 
raw_ostream &O); + void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printBankMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printBoundCtrl(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWADstSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSDWADstUnused(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInterpSlot(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInterpAttr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInterpAttrChan(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printVGPRIndexMode(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + + + template <unsigned N> + void printExpSrcN(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpSrc0(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpSrc1(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpSrc2(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpSrc3(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExpTgt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); - static void printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, char Asm); - static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, 
- raw_ostream &O); - static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + char Asm); + void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printUpdateExecMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printUpdatePred(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBankSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printWaitFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 1cb9d21..ffb92aa 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" @@ -22,30 +23,19 @@ using namespace llvm; namespace { -class AMDGPUMCObjectWriter : public MCObjectWriter { -public: - 
AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} - void executePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) override { - //XXX: Implement if necessary. - } - void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, bool &IsPCRel, - uint64_t &FixedValue) override { - assert(!"Not implemented"); - } - - void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; - -}; - class AMDGPUAsmBackend : public MCAsmBackend { public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; + + void processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -55,7 +45,7 @@ public: } void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, MCInst &Res) const override { - assert(!"Not implemented"); + llvm_unreachable("Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; @@ -65,15 +55,10 @@ public: } //End anonymous namespace -void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, - const MCAsmLayout &Layout) { - for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { - Asm.writeSectionData(&*I, Layout); - } -} - static unsigned getFixupKindNumBytes(unsigned Kind) { switch (Kind) { + case AMDGPU::fixup_si_sopp_br: + return 2; case FK_SecRel_1: case FK_Data_1: return 1; @@ -92,40 +77,77 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { } } +static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx) { + int64_t SignedValue = static_cast<int64_t>(Value); + + switch (Fixup.getKind()) { + case AMDGPU::fixup_si_sopp_br: { + int64_t BrImm = (SignedValue - 4) / 4; + + if (Ctx && !isInt<16>(BrImm)) + Ctx->reportError(Fixup.getLoc(), "branch size exceeds simm16"); + + return BrImm; + } + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + case FK_PCRel_4: + case FK_SecRel_4: + return Value; + default: + llvm_unreachable("unhandled fixup kind"); + } +} + +void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + MCValue Res; + + // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are + // used for long branches, this function will be called with + // IsResolved = false and Value set to some pre-computed value. In + // the example above, the value would be: + // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction. + // This is not what we want. We just want the expression computation + // only. The reason the MC layer subtracts the current offset from the + // expression is because the fixup is of kind FK_PCRel_4. + // For these scenarios, evaluateAsValue gives us the computation that we + // want. 
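To make the fixup_si_sopp_br arithmetic in adjustFixupValue above concrete: the SOPP branch immediate is a signed dword count measured from the instruction after the 4-byte branch, so the byte distance supplied by the PC-relative fixup is reduced by 4 and then divided by 4. A tiny illustrative check, assuming the fixup value is the byte distance from the start of the branch (not taken from the patch):

  #include <cassert>
  #include <cstdint>

  // Mirrors the fixup_si_sopp_br case of adjustFixupValue: convert a byte
  // distance into the hardware's dword offset from the following instruction.
  static int64_t soppBranchImm(int64_t ByteDistance) {
    return (ByteDistance - 4) / 4;
  }

  int main() {
    assert(soppBranchImm(4) == 0);     // fall through to the next instruction
    assert(soppBranchImm(132) == 32);  // 32 dwords past the next instruction
    assert(soppBranchImm(-12) == -4);  // a short backward branch
    return 0;
  }

Anything outside the signed 16-bit range is rejected with the "branch size exceeds simm16" diagnostic shown above.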
+ if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) && + Res.isAbsolute()) { + Value = Res.getConstant(); + IsResolved = true; + + } + if (IsResolved) + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); +} + void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { + if (!Value) + return; // Doesn't change encoding. - switch ((unsigned)Fixup.getKind()) { - case AMDGPU::fixup_si_sopp_br: { - int64_t BrImm = ((int64_t)Value - 4) / 4; - if (!isInt<16>(BrImm)) - report_fatal_error("branch size exceeds simm16"); - - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = BrImm; - break; - } - - default: { - // FIXME: Copied from AArch64 - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - if (!Value) - return; // Doesn't change encoding. - MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); - - // Shift the value into position. - Value <<= Info.TargetOffset; - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the - // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); - } - } + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + uint32_t Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the bits from + // the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( @@ -171,7 +193,8 @@ public: MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU) { + const Triple &TT, StringRef CPU, + const MCTargetOptions &Options) { // Use 64-bit ELF for amdgcn return new ELFAMDGPUAsmBackend(T, TT); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index b4e3b8e..1847d7a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -38,26 +38,40 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // SCRATCH_RSRC_DWORD[01] is a special global variable that represents - // the scratch buffer. - if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") - return ELF::R_AMDGPU_ABS32_LO; - if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") - return ELF::R_AMDGPU_ABS32_HI; + if (const auto *SymA = Target.getSymA()) { + // SCRATCH_RSRC_DWORD[01] is a special global variable that represents + // the scratch buffer. 
+ if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") + return ELF::R_AMDGPU_ABS32_LO; + + if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") + return ELF::R_AMDGPU_ABS32_HI; + } switch (Target.getAccessVariant()) { default: break; case MCSymbolRefExpr::VK_GOTPCREL: return ELF::R_AMDGPU_GOTPCREL; + case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO: + return ELF::R_AMDGPU_GOTPCREL32_LO; + case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI: + return ELF::R_AMDGPU_GOTPCREL32_HI; + case MCSymbolRefExpr::VK_AMDGPU_REL32_LO: + return ELF::R_AMDGPU_REL32_LO; + case MCSymbolRefExpr::VK_AMDGPU_REL32_HI: + return ELF::R_AMDGPU_REL32_HI; } switch (Fixup.getKind()) { default: break; case FK_PCRel_4: return ELF::R_AMDGPU_REL32; + case FK_Data_4: case FK_SecRel_4: return ELF::R_AMDGPU_ABS32; + case FK_Data_8: + return ELF::R_AMDGPU_ABS64; } llvm_unreachable("unhandled relocation type"); diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index c942ea9..3d3858a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -21,11 +21,19 @@ namespace llvm { class MCInst; +class MCInstrInfo; class MCOperand; class MCSubtargetInfo; +class FeatureBitset; class AMDGPUMCCodeEmitter : public MCCodeEmitter { virtual void anchor(); + +protected: + const MCInstrInfo &MCII; + + AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} + public: uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -43,6 +51,11 @@ public: const MCSubtargetInfo &STI) const { return 0; } + +protected: + uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; + void verifyInstructionPredicates(const MCInst &MI, + uint64_t AvailableFeatures) const; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index a0d9aab..136e6ec 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -86,7 +86,7 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, } extern "C" void LLVMInitializeAMDGPUTargetMC() { - for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); @@ -98,14 +98,15 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { } // R600 specific registration - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(), createR600MCCodeEmitter); // GCN specific registration - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(), + createSIMCCodeEmitter); - TargetRegistry::RegisterAsmTargetStreamer(TheGCNTarget, + TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(), createAMDGPUAsmTargetStreamer); - TargetRegistry::RegisterObjectTargetStreamer(TheGCNTarget, - createAMDGPUObjectTargetStreamer); + TargetRegistry::RegisterObjectTargetStreamer( + getTheGCNTarget(), createAMDGPUObjectTargetStreamer); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 9ab7940..548bad5 100644 --- 
a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -19,7 +19,6 @@ #include "llvm/Support/DataTypes.h" namespace llvm { -class StringRef; class MCAsmBackend; class MCCodeEmitter; class MCContext; @@ -27,13 +26,14 @@ class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; +class MCTargetOptions; +class StringRef; class Target; class Triple; class raw_pwrite_stream; -class raw_ostream; -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; +Target &getTheAMDGPUTarget(); +Target &getTheGCNTarget(); MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -44,7 +44,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); + const Triple &TT, StringRef CPU, + const MCTargetOptions &Options); MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend, diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp new file mode 100644 index 0000000..95387ad --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp @@ -0,0 +1,408 @@ +//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Generates AMDGPU runtime metadata for YAML mapping. 
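The newly added AMDGPURuntimeMD.cpp, whose body follows, builds per-kernel metadata structs and serializes them with LLVM's YAMLTraits machinery; later in this patch the target streamers either wrap the resulting string in .amdgpu_runtime_metadata directives or emit it as the desc payload of an ELF note. As a reminder of how that YAMLTraits pattern works in general, here is a minimal sketch with a made-up struct; the field and key names below are hypothetical, not the ones the patch defines:

  #include "llvm/Support/YAMLTraits.h"
  #include "llvm/Support/raw_ostream.h"
  #include <string>

  struct ExampleMD {
    std::string Name;
    unsigned Size = 0;
  };

  namespace llvm {
  namespace yaml {
  template <> struct MappingTraits<ExampleMD> {
    static void mapping(IO &YamlIO, ExampleMD &MD) {
      YamlIO.mapRequired("Name", MD.Name);
      YamlIO.mapOptional("Size", MD.Size, 0u);
    }
  };
  } // end namespace yaml
  } // end namespace llvm

  std::string toYAML(ExampleMD &MD) {
    std::string Text;
    llvm::raw_string_ostream Stream(Text);
    llvm::yaml::Output Output(Stream);
    Output << MD; // writes "Name: ..." / "Size: ..." style YAML, as
                  // Program::Metadata::toYAML does below
    return Stream.str();
  }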
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPURuntimeMetadata.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/YAMLTraits.h" +#include <vector> +#include "AMDGPURuntimeMD.h" + +using namespace llvm; +using namespace ::AMDGPU::RuntimeMD; + +static cl::opt<bool> +DumpRuntimeMD("amdgpu-dump-rtmd", + cl::desc("Dump AMDGPU runtime metadata")); + +static cl::opt<bool> +CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden, + cl::desc("Check AMDGPU runtime metadata YAML parser")); + +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) +LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) +LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata) + +namespace llvm { +namespace yaml { + +template <> struct MappingTraits<KernelArg::Metadata> { + static void mapping(IO &YamlIO, KernelArg::Metadata &A) { + YamlIO.mapRequired(KeyName::ArgSize, A.Size); + YamlIO.mapRequired(KeyName::ArgAlign, A.Align); + YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U); + YamlIO.mapRequired(KeyName::ArgKind, A.Kind); + YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType); + YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string()); + YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string()); + YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL); + YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL); + YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0)); + YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0)); + YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0)); + YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0)); + } + static const bool flow = true; +}; + +template <> struct MappingTraits<Kernel::Metadata> { + static void mapping(IO &YamlIO, Kernel::Metadata &K) { + YamlIO.mapRequired(KeyName::KernelName, K.Name); + YamlIO.mapOptional(KeyName::Language, K.Language, std::string()); + YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion); + YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize); + YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint); + YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string()); + YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex, + INVALID_KERNEL_INDEX); + YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups, + uint8_t(0)); + YamlIO.mapRequired(KeyName::Args, K.Args); + } + static const bool flow = true; +}; + +template <> struct MappingTraits<Program::Metadata> { + static void mapping(IO &YamlIO, Program::Metadata &Prog) { + YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq); + YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo); + YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels); + } + static const bool flow = true; +}; + +} // end namespace yaml +} // end namespace llvm + +// Get a vector of three integer values from MDNode \p Node; +static std::vector<uint32_t> getThreeInt32(MDNode *Node) { + assert(Node->getNumOperands() == 3); + std::vector<uint32_t> V; + for (const MDOperand &Op : Node->operands()) { + const ConstantInt *CI = mdconst::extract<ConstantInt>(Op); + V.push_back(CI->getZExtValue()); + } + 
return V; +} + +static std::string getOCLTypeName(Type *Ty, bool Signed) { + switch (Ty->getTypeID()) { + case Type::HalfTyID: + return "half"; + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + return "double"; + case Type::IntegerTyID: { + if (!Signed) + return (Twine('u') + getOCLTypeName(Ty, true)).str(); + unsigned BW = Ty->getIntegerBitWidth(); + switch (BW) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BW)).str(); + } + } + case Type::VectorTyID: { + VectorType *VecTy = cast<VectorType>(Ty); + Type *EleTy = VecTy->getElementType(); + unsigned Size = VecTy->getVectorNumElements(); + return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str(); + } + default: + return "unknown"; + } +} + +static KernelArg::ValueType getRuntimeMDValueType( + Type *Ty, StringRef TypeName) { + switch (Ty->getTypeID()) { + case Type::HalfTyID: + return KernelArg::F16; + case Type::FloatTyID: + return KernelArg::F32; + case Type::DoubleTyID: + return KernelArg::F64; + case Type::IntegerTyID: { + bool Signed = !TypeName.startswith("u"); + switch (Ty->getIntegerBitWidth()) { + case 8: + return Signed ? KernelArg::I8 : KernelArg::U8; + case 16: + return Signed ? KernelArg::I16 : KernelArg::U16; + case 32: + return Signed ? KernelArg::I32 : KernelArg::U32; + case 64: + return Signed ? KernelArg::I64 : KernelArg::U64; + default: + // Runtime does not recognize other integer types. Report as struct type. + return KernelArg::Struct; + } + } + case Type::VectorTyID: + return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName); + case Type::PointerTyID: + return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName); + default: + return KernelArg::Struct; + } +} + +static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace( + AMDGPUAS::AddressSpaces A) { + switch (A) { + case AMDGPUAS::GLOBAL_ADDRESS: + return KernelArg::Global; + case AMDGPUAS::CONSTANT_ADDRESS: + return KernelArg::Constant; + case AMDGPUAS::LOCAL_ADDRESS: + return KernelArg::Local; + case AMDGPUAS::FLAT_ADDRESS: + return KernelArg::Generic; + case AMDGPUAS::REGION_ADDRESS: + return KernelArg::Region; + default: + return KernelArg::Private; + } +} + +static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL, + Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "", + StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "", + StringRef AccQual = "") { + + KernelArg::Metadata Arg; + + // Set ArgSize and ArgAlign. + Arg.Size = DL.getTypeAllocSize(T); + Arg.Align = DL.getABITypeAlignment(T); + if (auto PT = dyn_cast<PointerType>(T)) { + auto ET = PT->getElementType(); + if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized()) + Arg.PointeeAlign = DL.getABITypeAlignment(ET); + } + + // Set ArgTypeName. + Arg.TypeName = TypeName; + + // Set ArgName. + Arg.Name = ArgName; + + // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe. + SmallVector<StringRef, 1> SplitQ; + TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */); + + for (StringRef KeyName : SplitQ) { + auto *P = StringSwitch<uint8_t *>(KeyName) + .Case("volatile", &Arg.IsVolatile) + .Case("restrict", &Arg.IsRestrict) + .Case("const", &Arg.IsConst) + .Case("pipe", &Arg.IsPipe) + .Default(nullptr); + if (P) + *P = 1; + } + + // Set ArgKind. + Arg.Kind = Kind; + + // Set ArgValueType. + Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName); + + // Set ArgAccQual. 
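For readers skimming getOCLTypeName above: unsigned integers pick up a 'u' prefix and vector types append their element count, so an unsigned 16-bit scalar prints as "ushort" and a <16 x i8> with an unsigned base type prints as "uchar16". A trimmed-down, runnable mirror of the integer path, using a plain bit width instead of llvm::Type* (illustration only, not part of the patch):

  #include <cassert>
  #include <string>

  static std::string oclIntName(unsigned BitWidth, bool Signed,
                                unsigned VecLen = 1) {
    std::string Base;
    switch (BitWidth) {
    case 8:  Base = "char";  break;
    case 16: Base = "short"; break;
    case 32: Base = "int";   break;
    case 64: Base = "long";  break;
    default: Base = "i" + std::to_string(BitWidth); break;
    }
    if (!Signed)
      Base = "u" + Base;              // same effect as the Twine('u') branch
    if (VecLen > 1)
      Base += std::to_string(VecLen); // vectors append the element count
    return Base;
  }

  int main() {
    assert(oclIntName(32, true) == "int");
    assert(oclIntName(16, false) == "ushort");
    assert(oclIntName(8, false, 16) == "uchar16");
    return 0;
  }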
+ if (!AccQual.empty()) { + Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual) + .Case("read_only", KernelArg::ReadOnly) + .Case("write_only", KernelArg::WriteOnly) + .Case("read_write", KernelArg::ReadWrite) + .Default(KernelArg::AccNone); + } + + // Set ArgAddrQual. + if (auto *PT = dyn_cast<PointerType>(T)) { + Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>( + PT->getAddressSpace())); + } + + return Arg; +} + +static Kernel::Metadata getRuntimeMDForKernel(const Function &F) { + Kernel::Metadata Kernel; + Kernel.Name = F.getName(); + auto &M = *F.getParent(); + + // Set Language and LanguageVersion. + if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { + if (MD->getNumOperands() != 0) { + auto Node = MD->getOperand(0); + if (Node->getNumOperands() > 1) { + Kernel.Language = "OpenCL C"; + uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) + ->getZExtValue(); + uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) + ->getZExtValue(); + Kernel.LanguageVersion.push_back(Major); + Kernel.LanguageVersion.push_back(Minor); + } + } + } + + const DataLayout &DL = F.getParent()->getDataLayout(); + for (auto &Arg : F.args()) { + unsigned I = Arg.getArgNo(); + Type *T = Arg.getType(); + auto TypeName = dyn_cast<MDString>(F.getMetadata( + "kernel_arg_type")->getOperand(I))->getString(); + auto BaseTypeName = cast<MDString>(F.getMetadata( + "kernel_arg_base_type")->getOperand(I))->getString(); + StringRef ArgName; + if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) + ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString(); + auto TypeQual = cast<MDString>(F.getMetadata( + "kernel_arg_type_qual")->getOperand(I))->getString(); + auto AccQual = cast<MDString>(F.getMetadata( + "kernel_arg_access_qual")->getOperand(I))->getString(); + KernelArg::Kind Kind; + if (TypeQual.find("pipe") != StringRef::npos) + Kind = KernelArg::Pipe; + else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName) + .Case("sampler_t", KernelArg::Sampler) + .Case("queue_t", KernelArg::Queue) + .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", + "image2d_t" , "image2d_array_t", KernelArg::Image) + .Cases("image2d_depth_t", "image2d_array_depth_t", + "image2d_msaa_t", "image2d_array_msaa_t", + "image2d_msaa_depth_t", KernelArg::Image) + .Cases("image2d_array_msaa_depth_t", "image3d_t", + KernelArg::Image) + .Default(isa<PointerType>(T) ? + (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ? + KernelArg::DynamicSharedPointer : + KernelArg::GlobalBuffer) : + KernelArg::ByValue); + Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind, + BaseTypeName, TypeName, ArgName, TypeQual, AccQual)); + } + + // Emit hidden kernel arguments for OpenCL kernels. + if (F.getParent()->getNamedMetadata("opencl.ocl.version")) { + auto Int64T = Type::getInt64Ty(F.getContext()); + Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, + KernelArg::HiddenGlobalOffsetX)); + Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, + KernelArg::HiddenGlobalOffsetY)); + Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, + KernelArg::HiddenGlobalOffsetZ)); + if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) { + auto Int8PtrT = Type::getInt8PtrTy(F.getContext(), + KernelArg::Global); + Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT, + KernelArg::HiddenPrintfBuffer)); + } + } + + // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint. 
+ if (auto RWGS = F.getMetadata("reqd_work_group_size")) + Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS); + + if (auto WGSH = F.getMetadata("work_group_size_hint")) + Kernel.WorkGroupSizeHint = getThreeInt32(WGSH); + + if (auto VTH = F.getMetadata("vec_type_hint")) + Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>( + VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>( + VTH->getOperand(1))->getZExtValue()); + + return Kernel; +} + +Program::Metadata::Metadata(const std::string &YAML) { + yaml::Input Input(YAML); + Input >> *this; +} + +std::string Program::Metadata::toYAML(void) { + std::string Text; + raw_string_ostream Stream(Text); + yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */); + Output << *this; + return Stream.str(); +} + +Program::Metadata Program::Metadata::fromYAML(const std::string &S) { + return Program::Metadata(S); +} + +// Check if the YAML string can be parsed. +static void checkRuntimeMDYAMLString(const std::string &YAML) { + auto P = Program::Metadata::fromYAML(YAML); + auto S = P.toYAML(); + llvm::errs() << "AMDGPU runtime metadata parser test " + << (YAML == S ? "passes" : "fails") << ".\n"; + if (YAML != S) { + llvm::errs() << "First output: " << YAML << '\n' + << "Second output: " << S << '\n'; + } +} + +std::string llvm::getRuntimeMDYAMLString(Module &M) { + Program::Metadata Prog; + Prog.MDVersionSeq.push_back(MDVersion); + Prog.MDVersionSeq.push_back(MDRevision); + + // Set PrintfInfo. + if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) { + for (unsigned I = 0; I < MD->getNumOperands(); ++I) { + auto Node = MD->getOperand(I); + if (Node->getNumOperands() > 0) + Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0)) + ->getString()); + } + } + + // Set Kernels. + for (auto &F: M.functions()) { + if (!F.getMetadata("kernel_arg_type")) + continue; + Prog.Kernels.emplace_back(getRuntimeMDForKernel(F)); + } + + auto YAML = Prog.toYAML(); + + if (DumpRuntimeMD) + llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n'; + + if (CheckRuntimeMDParser) + checkRuntimeMDYAMLString(YAML); + + return YAML; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h new file mode 100644 index 0000000..a92fdd4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h @@ -0,0 +1,26 @@ +//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares functions for generating runtime metadata. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H + +#include <string> + +namespace llvm { +class Module; + +// Get runtime metadata as YAML string. 
+std::string getRuntimeMDYAMLString(Module &M); + +} +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 83dcaac..3392183 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -11,21 +11,33 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDKernelCodeTUtils.h" #include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/ELF.h" #include "llvm/Support/FormattedStream.h" +#include "AMDGPURuntimeMD.h" + +namespace llvm { +#include "AMDGPUPTNote.h" +} using namespace llvm; +using namespace llvm::AMDGPU; AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S) - : MCTargetStreamer(S) { } + : MCTargetStreamer(S) {} //===----------------------------------------------------------------------===// // AMDGPUTargetAsmStreamer @@ -56,169 +68,9 @@ AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, void AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { - uint64_t ComputePgmRsrc2 = (Header.compute_pgm_resource_registers >> 32); - bool EnableSGPRPrivateSegmentBuffer = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); - bool EnableSGPRDispatchPtr = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); - bool EnableSGPRQueuePtr = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); - bool EnableSGPRKernargSegmentPtr = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); - bool EnableSGPRDispatchID = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - bool EnableSGPRFlatScratchInit = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); - bool EnableSGPRPrivateSegmentSize = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - bool EnableSGPRGridWorkgroupCountX = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X); - bool EnableSGPRGridWorkgroupCountY = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y); - bool EnableSGPRGridWorkgroupCountZ = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z); - bool EnableOrderedAppendGDS = (Header.code_properties & - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS); - uint32_t PrivateElementSize = (Header.code_properties & - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >> - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT; - bool IsPtr64 = (Header.code_properties & AMD_CODE_PROPERTY_IS_PTR64); - bool IsDynamicCallstack = (Header.code_properties & - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK); - bool IsDebugEnabled = (Header.code_properties & - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED); - bool IsXNackEnabled = (Header.code_properties & - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED); - - OS << "\t.amd_kernel_code_t\n" << - "\t\tkernel_code_version_major = " << - Header.amd_kernel_code_version_major << '\n' << - "\t\tkernel_code_version_minor = " << - 
Header.amd_kernel_code_version_minor << '\n' << - "\t\tmachine_kind = " << - Header.amd_machine_kind << '\n' << - "\t\tmachine_version_major = " << - Header.amd_machine_version_major << '\n' << - "\t\tmachine_version_minor = " << - Header.amd_machine_version_minor << '\n' << - "\t\tmachine_version_stepping = " << - Header.amd_machine_version_stepping << '\n' << - "\t\tkernel_code_entry_byte_offset = " << - Header.kernel_code_entry_byte_offset << '\n' << - "\t\tkernel_code_prefetch_byte_size = " << - Header.kernel_code_prefetch_byte_size << '\n' << - "\t\tmax_scratch_backing_memory_byte_size = " << - Header.max_scratch_backing_memory_byte_size << '\n' << - "\t\tcompute_pgm_rsrc1_vgprs = " << - G_00B848_VGPRS(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_sgprs = " << - G_00B848_SGPRS(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_priority = " << - G_00B848_PRIORITY(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_float_mode = " << - G_00B848_FLOAT_MODE(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_priv = " << - G_00B848_PRIV(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_dx10_clamp = " << - G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_debug_mode = " << - G_00B848_DEBUG_MODE(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc1_ieee_mode = " << - G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) << '\n' << - "\t\tcompute_pgm_rsrc2_scratch_en = " << - G_00B84C_SCRATCH_EN(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_user_sgpr = " << - G_00B84C_USER_SGPR(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_tgid_x_en = " << - G_00B84C_TGID_X_EN(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_tgid_y_en = " << - G_00B84C_TGID_Y_EN(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_tgid_z_en = " << - G_00B84C_TGID_Z_EN(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_tg_size_en = " << - G_00B84C_TG_SIZE_EN(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = " << - G_00B84C_TIDIG_COMP_CNT(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_excp_en_msb = " << - G_00B84C_EXCP_EN_MSB(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_lds_size = " << - G_00B84C_LDS_SIZE(ComputePgmRsrc2) << '\n' << - "\t\tcompute_pgm_rsrc2_excp_en = " << - G_00B84C_EXCP_EN(ComputePgmRsrc2) << '\n' << - - "\t\tenable_sgpr_private_segment_buffer = " << - EnableSGPRPrivateSegmentBuffer << '\n' << - "\t\tenable_sgpr_dispatch_ptr = " << - EnableSGPRDispatchPtr << '\n' << - "\t\tenable_sgpr_queue_ptr = " << - EnableSGPRQueuePtr << '\n' << - "\t\tenable_sgpr_kernarg_segment_ptr = " << - EnableSGPRKernargSegmentPtr << '\n' << - "\t\tenable_sgpr_dispatch_id = " << - EnableSGPRDispatchID << '\n' << - "\t\tenable_sgpr_flat_scratch_init = " << - EnableSGPRFlatScratchInit << '\n' << - "\t\tenable_sgpr_private_segment_size = " << - EnableSGPRPrivateSegmentSize << '\n' << - "\t\tenable_sgpr_grid_workgroup_count_x = " << - EnableSGPRGridWorkgroupCountX << '\n' << - "\t\tenable_sgpr_grid_workgroup_count_y = " << - EnableSGPRGridWorkgroupCountY << '\n' << - "\t\tenable_sgpr_grid_workgroup_count_z = " << - EnableSGPRGridWorkgroupCountZ << '\n' << - "\t\tenable_ordered_append_gds = " << - EnableOrderedAppendGDS << '\n' << - "\t\tprivate_element_size = " << - PrivateElementSize << '\n' << - "\t\tis_ptr64 = " << - IsPtr64 << '\n' << - "\t\tis_dynamic_callstack = " << - IsDynamicCallstack 
<< '\n' << - "\t\tis_debug_enabled = " << - IsDebugEnabled << '\n' << - "\t\tis_xnack_enabled = " << - IsXNackEnabled << '\n' << - "\t\tworkitem_private_segment_byte_size = " << - Header.workitem_private_segment_byte_size << '\n' << - "\t\tworkgroup_group_segment_byte_size = " << - Header.workgroup_group_segment_byte_size << '\n' << - "\t\tgds_segment_byte_size = " << - Header.gds_segment_byte_size << '\n' << - "\t\tkernarg_segment_byte_size = " << - Header.kernarg_segment_byte_size << '\n' << - "\t\tworkgroup_fbarrier_count = " << - Header.workgroup_fbarrier_count << '\n' << - "\t\twavefront_sgpr_count = " << - Header.wavefront_sgpr_count << '\n' << - "\t\tworkitem_vgpr_count = " << - Header.workitem_vgpr_count << '\n' << - "\t\treserved_vgpr_first = " << - Header.reserved_vgpr_first << '\n' << - "\t\treserved_vgpr_count = " << - Header.reserved_vgpr_count << '\n' << - "\t\treserved_sgpr_first = " << - Header.reserved_sgpr_first << '\n' << - "\t\treserved_sgpr_count = " << - Header.reserved_sgpr_count << '\n' << - "\t\tdebug_wavefront_private_segment_offset_sgpr = " << - Header.debug_wavefront_private_segment_offset_sgpr << '\n' << - "\t\tdebug_private_segment_buffer_sgpr = " << - Header.debug_private_segment_buffer_sgpr << '\n' << - "\t\tkernarg_segment_alignment = " << - (uint32_t)Header.kernarg_segment_alignment << '\n' << - "\t\tgroup_segment_alignment = " << - (uint32_t)Header.group_segment_alignment << '\n' << - "\t\tprivate_segment_alignment = " << - (uint32_t)Header.private_segment_alignment << '\n' << - "\t\twavefront_size = " << - (uint32_t)Header.wavefront_size << '\n' << - "\t\tcall_convention = " << - Header.call_convention << '\n' << - "\t\truntime_loader_kernel_symbol = " << - Header.runtime_loader_kernel_symbol << '\n' << - // TODO: control_directives - "\t.end_amd_kernel_code_t\n"; - + OS << "\t.amd_kernel_code_t\n"; + dumpAmdKernelCode(&Header, OS, "\t\t"); + OS << "\t.end_amd_kernel_code_t\n"; } void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, @@ -241,35 +93,63 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; } +void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) { + OS << "\t.amdgpu_runtime_metadata\n"; + OS << getRuntimeMDYAMLString(M); + OS << "\n\t.end_amdgpu_runtime_metadata\n"; +} + +void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) { + OS << "\t.amdgpu_runtime_metadata"; + OS << Metadata; + OS << "\t.end_amdgpu_runtime_metadata\n"; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S) - : AMDGPUTargetStreamer(S), Streamer(S) { } + : AMDGPUTargetStreamer(S), Streamer(S) {} MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } void +AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ, + PT_NOTE::NoteType Type, + std::function<void(MCELFStreamer &)> EmitDesc) { + auto &S = getStreamer(); + auto &Context = S.getContext(); + + auto NameSZ = sizeof(PT_NOTE::NoteName); + + S.PushSection(); + S.SwitchSection(Context.getELFSection( + PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); + S.EmitIntValue(NameSZ, 4); // namesz + S.EmitValue(DescSZ, 4); // descz + S.EmitIntValue(Type, 4); // type + S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name + S.EmitValueToAlignment(4, 0, 
1, 0); // padding 0 + EmitDesc(S); // desc + S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 + S.PopSection(); +} + +void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) { - MCStreamer &OS = getStreamer(); - MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0); - - unsigned NameSZ = 4; - OS.PushSection(); - OS.SwitchSection(Note); - OS.EmitIntValue(NameSZ, 4); // namesz - OS.EmitIntValue(8, 4); // descz - OS.EmitIntValue(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, 4); // type - OS.EmitBytes(StringRef("AMD", NameSZ)); // name - OS.EmitIntValue(Major, 4); // desc - OS.EmitIntValue(Minor, 4); - OS.EmitValueToAlignment(4); - OS.PopSection(); + EmitAMDGPUNote( + MCConstantExpr::create(8, getContext()), + PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, + [&](MCELFStreamer &OS){ + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + } + ); } void @@ -278,33 +158,28 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Stepping, StringRef VendorName, StringRef ArchName) { - MCStreamer &OS = getStreamer(); - MCSectionELF *Note = OS.getContext().getELFSection(".note", ELF::SHT_NOTE, 0); - - unsigned NameSZ = 4; uint16_t VendorNameSize = VendorName.size() + 1; uint16_t ArchNameSize = ArchName.size() + 1; + unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + - sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + - VendorNameSize + ArchNameSize; - - OS.PushSection(); - OS.SwitchSection(Note); - OS.EmitIntValue(NameSZ, 4); // namesz - OS.EmitIntValue(DescSZ, 4); // descsz - OS.EmitIntValue(NT_AMDGPU_HSA_ISA, 4); // type - OS.EmitBytes(StringRef("AMD", 4)); // name - OS.EmitIntValue(VendorNameSize, 2); // desc - OS.EmitIntValue(ArchNameSize, 2); - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - OS.EmitIntValue(Stepping, 4); - OS.EmitBytes(VendorName); - OS.EmitIntValue(0, 1); // NULL terminate VendorName - OS.EmitBytes(ArchName); - OS.EmitIntValue(0, 1); // NULL terminte ArchName - OS.EmitValueToAlignment(4); - OS.PopSection(); + sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + + VendorNameSize + ArchNameSize; + + EmitAMDGPUNote( + MCConstantExpr::create(DescSZ, getContext()), + PT_NOTE::NT_AMDGPU_HSA_ISA, + [&](MCELFStreamer &OS) { + OS.EmitIntValue(VendorNameSize, 2); + OS.EmitIntValue(ArchNameSize, 2); + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + OS.EmitIntValue(Stepping, 4); + OS.EmitBytes(VendorName); + OS.EmitIntValue(0, 1); // NULL terminate VendorName + OS.EmitBytes(ArchName); + OS.EmitIntValue(0, 1); // NULL terminte ArchName + } + ); } void @@ -340,3 +215,28 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( Symbol->setType(ELF::STT_OBJECT); Symbol->setBinding(ELF::STB_GLOBAL); } + +void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) { + // Create two labels to mark the beginning and end of the desc field + // and a MCExpr to calculate the size of the desc field. 
+ auto &Context = getContext(); + auto *DescBegin = Context.createTempSymbol(); + auto *DescEnd = Context.createTempSymbol(); + auto *DescSZ = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DescEnd, Context), + MCSymbolRefExpr::create(DescBegin, Context), Context); + + EmitAMDGPUNote( + DescSZ, + PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(Metadata); + OS.EmitLabel(DescEnd); + } + ); +} + +void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) { + EmitRuntimeMetadata(getRuntimeMDYAMLString(M)); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index b3d59e8..e2f2058 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -14,11 +14,20 @@ #include "llvm/MC/MCStreamer.h" namespace llvm { +#include "AMDGPUPTNote.h" +class DataLayout; +class Function; class MCELFStreamer; class MCSymbol; +class MDNode; +class Module; +class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { +protected: + MCContext &getContext() const { return Streamer.getContext(); } + public: AMDGPUTargetStreamer(MCStreamer &S); virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -36,6 +45,10 @@ public: virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitRuntimeMetadata(Module &M) = 0; + + virtual void EmitRuntimeMetadata(StringRef Metadata) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -56,23 +69,19 @@ public: void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; -}; -class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { + void EmitRuntimeMetadata(Module &M) override; - enum NoteType { - NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, - NT_AMDGPU_HSA_HSAIL = 2, - NT_AMDGPU_HSA_ISA = 3, - NT_AMDGPU_HSA_PRODUCER = 4, - NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, - NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, - NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 - }; + void EmitRuntimeMetadata(StringRef Metadata) override; +}; +class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { MCStreamer &Streamer; + void EmitAMDGPUNote(const MCExpr* DescSize, + AMDGPU::PT_NOTE::NoteType Type, + std::function<void(MCELFStreamer &)> EmitDesc); + public: AMDGPUTargetELFStreamer(MCStreamer &S); @@ -92,6 +101,10 @@ public: void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; + + void EmitRuntimeMetadata(Module &M) override; + + void EmitRuntimeMetadata(StringRef Metadata) override; }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 5e8e6ce..6015ec1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -20,26 +20,30 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include 
"llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> using namespace llvm; namespace { class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; - void operator=(const R600MCCodeEmitter &) = delete; - const MCInstrInfo &MCII; const MCRegisterInfo &MRI; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : MCII(mcii), MRI(mri) { } + : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete; /// \brief Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -58,7 +62,7 @@ private: unsigned getHWReg(unsigned regNo) const; }; -} // End anonymous namespace +} // end anonymous namespace enum RegElement { ELEMENT_X = 0, @@ -86,6 +90,9 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + verifyInstructionPredicates(MI, + computeAvailableFeatures(STI.getFeatureBits())); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || @@ -178,4 +185,5 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, return MO.getImm(); } +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 71b585c..0c5bb06 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===// // // The LLVM Compiler Infrastructure // @@ -17,38 +17,42 @@ #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <cstdlib> using namespace llvm; namespace { class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; - void operator=(const SIMCCodeEmitter &) = delete; - const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - /// \brief Can this operand also contain immediate values? 
- bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; - /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; + uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI) const; public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri) { } - - ~SIMCCodeEmitter() override {} + : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; /// \brief Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -67,7 +71,7 @@ public: const MCSubtargetInfo &STI) const override; }; -} // End anonymous namespace +} // end anonymous namespace MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -75,14 +79,6 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, return new SIMCCodeEmitter(MCII, MRI, Ctx); } -bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, - unsigned OpNo) const { - unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_IMM32 || - OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - // Returns the encoding value to use if the given integer is an integer inline // immediate value, or 0 if it is not. template <typename IntTy> @@ -96,7 +92,43 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) { return 0; } -static uint32_t getLit32Encoding(uint32_t Val) { +static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == 0x3800) // 0.5 + return 240; + + if (Val == 0xB800) // -0.5 + return 241; + + if (Val == 0x3C00) // 1.0 + return 242; + + if (Val == 0xBC00) // -1.0 + return 243; + + if (Val == 0x4000) // 2.0 + return 244; + + if (Val == 0xC000) // -2.0 + return 245; + + if (Val == 0x4400) // 4.0 + return 246; + + if (Val == 0xC400) // -4.0 + return 247; + + if (Val == 0x3118 && // 1.0 / (2.0 * pi) + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + return 248; + + return 255; +} + +static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) { uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); if (IntImm != 0) return IntImm; @@ -125,10 +157,14 @@ static uint32_t getLit32Encoding(uint32_t Val) { if (Val == FloatToBits(-4.0f)) return 247; + if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi) + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + return 248; + return 255; } -static uint32_t getLit64Encoding(uint64_t Val) { +static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) { uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); if (IntImm != 0) return IntImm; @@ -157,15 +193,19 @@ static uint32_t getLit64Encoding(uint64_t Val) { if (Val == DoubleToBits(-4.0)) return 247; + if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi) + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + return 248; + return 255; } uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, - unsigned OpSize) const { - + const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI) const { int64_t Imm; if (MO.isExpr()) { - const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr()); + const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr()); if (!C) return 255; @@ -180,17 +220,23 @@ 
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, Imm = MO.getImm(); } - if (OpSize == 4) - return getLit32Encoding(static_cast<uint32_t>(Imm)); - - assert(OpSize == 8); - - return getLit64Encoding(static_cast<uint64_t>(Imm)); + switch (AMDGPU::getOperandSize(OpInfo)) { + case 4: + return getLit32Encoding(static_cast<uint32_t>(Imm), STI); + case 8: + return getLit64Encoding(static_cast<uint64_t>(Imm), STI); + case 2: + return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + default: + llvm_unreachable("invalid operand size"); + } } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + verifyInstructionPredicates(MI, + computeAvailableFeatures(STI.getFeatureBits())); uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); @@ -207,15 +253,12 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { // Check if this operand should be encoded as [SV]Src - if (!isSrcOperand(Desc, i)) + if (!AMDGPU::isSISrcOperand(Desc, i)) continue; - int RCID = Desc.OpInfo[i].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - // Is this operand a literal immediate? const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op, RC.getSize()) != 255) + if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255) continue; // Yes! Encode it @@ -224,7 +267,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (Op.isImm()) Imm = Op.getImm(); else if (Op.isExpr()) { - if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) Imm = C->getValue(); } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. @@ -262,7 +305,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); MCFixupKind Kind; if (Expr && Expr->getSymbol().isExternal()) Kind = FK_Data_4; @@ -279,11 +322,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (isSrcOperand(Desc, OpNo)) { - int RCID = Desc.OpInfo[OpNo].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - - uint32_t Enc = getLitEncoding(MO, RC.getSize()); + if (AMDGPU::isSISrcOperand(Desc, OpNo)) { + uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) return Enc; @@ -293,4 +333,3 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, llvm_unreachable("Encoding of this operand type is not supported yet."); return 0; } - diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td new file mode 100644 index 0000000..46803e5 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -0,0 +1,763 @@ +//===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class MIMG_Mask <string op, int channels> { + string Op = op; + int Channels = channels; +} + +class mimg <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class MIMG_Helper <dag outs, dag ins, string asm, + string dns=""> : MIMG<outs, ins, asm,[]> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let DecoderNamespace = dns; + let isAsmParserOnly = !if(!eq(dns,""), 1, 0); + let AsmMatchConverter = "cvtMIMG"; + let usesCustomInserter = 1; +} + +class MIMG_NoSampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass addr_rc, + string dns=""> : MIMG_Helper < + (outs dst_rc:$vdata), + (ins addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe<op> { + let ssamp = 0; +} + +multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(!eq(channels, 1), "AMDGPU", "")>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; +} + +multiclass MIMG_NoSampler <bits<7> op, string asm> { + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; +} + +class MIMG_Store_Helper <bits<7> op, string asm, + RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + (outs), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + >, MIMGe<op> { + let ssamp = 0; + let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; + let DisableWQM = 1; +} + +multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, + RegisterClass data_rc, + int channels> { + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; +} + +multiclass MIMG_Store <bits<7> op, string asm> { + defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; + defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>; + defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>; + defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>; +} + +class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + (outs data_rc:$vdst), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + > { + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; + let DisableWQM = 1; + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; +} + +class MIMG_Atomic_Real_si<mimg op, string 
name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> : + MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + SIMCInstr<name, SIEncodingFamily.SI>, + MIMGe<op.SI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class MIMG_Atomic_Real_vi<mimg op, string name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> : + MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + SIMCInstr<name, SIEncodingFamily.VI>, + MIMGe<op.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + SIMCInstr<name, SIEncodingFamily.NONE>; + } + + let ssamp = 0 in { + def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>; + + def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>; + } +} + +multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> { + defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>; + defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>; + defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>; +} + +class MIMG_Sampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, + bit wqm, + string dns=""> : MIMG_Helper < + (outs dst_rc:$vdata), + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe<op> { + let WQM = wqm; +} + +multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels, bit wqm> { + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm, + !if(!eq(channels, 1), "AMDGPU", "")>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>; +} + +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; + +class MIMG_Gather_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, bit wqm> : MIMG < + (outs dst_rc:$vdata), + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + []>, MIMGe<op> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. 
(e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. + // Therefore, disable all code which updates DMASK by setting this: + let Gather4 = 1; + let hasPostISelHook = 0; + let WQM = wqm; + + let isAsmParserOnly = 1; // TBD: fix it later +} + +multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels, bit wqm> { + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>; +} + +multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; + +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// +let SubtargetPredicate = isGCN in { +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 
0x0000001e>; -- not on VI +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM 
<0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; +} + +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ + +// Image + sampler +class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode $addr, $rsrc, $sampler, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) +>; + +multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; +} + +// Image + sampler for amdgcn +// TODO: +// 1. Handle half data type like v4f16, and add D16 bit support; +// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). +// 3. Add A16 support when we pass address of half type. 
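As a reading aid: a single line such as defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE"> (instantiated further down in this file) fans out, through the SampleRawPattern class above, into one selection pattern per address width. Hand-expanding the i32-address case gives, roughly, the following sketch:

  def : Pat <
    (int_SI_image_sample i32:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask,
     i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
    // The immediate operands are reordered into MIMG operand order:
    // dmask, unorm, glc, slc, r128, tfe, lwe, da.
    (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler,
     (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
     (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
  >;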
+multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { + def : Pat< + (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, + i1:$slc, i1:$lwe, i1:$da)), + (opcode $addr, $rsrc, $sampler, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + >; +} + +multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { + defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>; + defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>; + defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>; + defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>; + defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>; +} + +// TODO: support v3f32. +multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> { + defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>; + defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; + defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; +} + +// Image only +class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, + imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), + (opcode $addr, $rsrc, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) +>; + +multiclass ImagePatterns<SDPatternOperator name, string opcode> { + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { + def : Pat < + (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, + i1:$da)), + (opcode $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + >; +} + +multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { + defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; + defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; + defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; +} + +// TODO: support v3f32. +multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { + defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>; + defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; + defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; +} + +multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { + def : Pat < + (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, + i1:$lwe, i1:$da), + (opcode $data, $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + >; +} + +multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { + defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; + defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; + defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; +} + +// TODO: support v3f32. 
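The amdgcn multiclasses just defined nest one level deeper than the SI ones: AMDGCNSamplePatterns iterates over the result width (_V1/_V2/_V4) and AMDGCNSampleDataPatterns over the address width (_V1 through _V16), and the intrinsic no longer exposes the r128 and tfe operands, whose slots are tied to zero. The load and store multiclasses below follow the same fan-out. For example, AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE"> (instantiated further down) would, among many others, produce a pattern roughly like this sketch:

  def : Pat <
    (v4f32 (int_amdgcn_image_sample f32:$addr, v8i32:$rsrc, v4i32:$sampler,
            i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)),
    // The two literal zeros fill the r128 and tfe slots of the MIMG encoding.
    (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler,
     (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
     0, 0, (as_i1imm $lwe), (as_i1imm $da))
  >;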
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { + defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>; + defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; + defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; +} + +class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), + (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>; + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>; + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; +} + +class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < + (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, + imm:$r128, imm:$da, imm:$slc), + (EXTRACT_SUBREG + (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), + $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), + sub0) +>; + +// ======= SI Image Intrinsics ================ + +// Image load +defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; +defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; +def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; + +// Basic sample +defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; +defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; +defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; +defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison +defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; +defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets +defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; +defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_lz_o, 
"IMAGE_SAMPLE_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets +defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; + +// Gather opcodes +// Only the variants which make sense are defined. +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; +def : 
SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; + +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; + +// ======= amdgcn Image Intrinsics ============== + +// Image load +defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">; +defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">; +defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">; + +// Image store +defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">; +defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">; + +// Basic sample +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, 
"IMAGE_SAMPLE_C_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; + +// Gather opcodes +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">; + +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">; + +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">; + +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">; +defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">; + +defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">; + +// Image atomics +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">; 
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; + +/* SIsample for simple 1D texture lookup */ +def : Pat < + (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) +>; + +class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) +>; + +class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) +>; + +class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) +>; + +class SampleShadowPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) +>; + +class SampleShadowArrayPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) +>; + +/* SIsample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, + MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, +MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { + def : SamplePattern <SIsample, sample, addr_type>; + def : SampleRectPattern <SIsample, sample, addr_type>; + def : SampleArrayPattern <SIsample, sample, addr_type>; + def : SampleShadowPattern <SIsample, sample_c, addr_type>; + def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; + + def : SamplePattern <SIsamplel, sample_l, addr_type>; + def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; + def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; + def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; + + def : SamplePattern <SIsampleb, sample_b, addr_type>; + def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; + def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; + def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; + + def : SamplePattern <SIsampled, sample_d, addr_type>; + def : SampleArrayPattern <SIsampled, sample_d, addr_type>; + def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>; + def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; +} + +defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, + IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, + IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, + IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, + v2i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, + IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, + IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, + IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, + v4i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, + IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, + IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, + IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, + v8i32>; +defm : 
SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, + IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, + IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, + IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, + v16i32>; diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index f5f1eb1..3c07cc7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -101,55 +101,89 @@ def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; //===----------------------------------------------------------------------===// def : ProcessorModel<"bonaire", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0] + [FeatureISAVersion7_0_0] >; def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16] + [FeatureISAVersion7_0_2] >; def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32, FeatureISAVersion7_0_0] + [FeatureISAVersion7_0_0] >; -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32, FeatureISAVersion7_0_1] +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureISAVersion7_0_1] >; def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16]>; + [FeatureISAVersion7_0_2]>; + +def : ProcessorModel<"gfx700", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"gfx701", SIFullSpeedModel, + [FeatureISAVersion7_0_1] +>; + +def : ProcessorModel<"gfx702", SIQuarterSpeedModel, + [FeatureISAVersion7_0_2] +>; //===----------------------------------------------------------------------===// // Volcanic Islands //===----------------------------------------------------------------------===// def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, - FeatureLDSBankCount32] + [FeatureISAVersion8_0_2] >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, - FeatureLDSBankCount32] + [FeatureISAVersion8_0_0] >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] + [FeatureISAVersion8_0_1] >; -def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32] +def : ProcessorModel<"fiji", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] >; -def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] >; def : ProcessorModel<"polaris10", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] + [FeatureISAVersion8_0_3] >; def : ProcessorModel<"polaris11", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"gfx800", SIQuarterSpeedModel, + [FeatureISAVersion8_0_0] +>; + +def : ProcessorModel<"gfx801", SIQuarterSpeedModel, + [FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"gfx802", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + +def : ProcessorModel<"gfx803", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"gfx804", SIQuarterSpeedModel, + [FeatureISAVersion8_0_4] +>; + +def 
: ProcessorModel<"gfx810", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] +>; + diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 3ccde79..d0aba38 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -66,7 +66,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override; + StringRef getPassName() const override; }; char R600ClauseMergePass::ID = 0; @@ -201,7 +201,7 @@ bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { return false; } -const char *R600ClauseMergePass::getPassName() const { +StringRef R600ClauseMergePass::getPassName() const { return "R600 Merge Clause Markers Pass"; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index d5bda4a..45b36d3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -354,10 +354,10 @@ private: if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) continue; int64_t Imm = Src.second; - std::vector<MachineOperand*>::iterator It = - std::find_if(Lits.begin(), Lits.end(), - [&](MachineOperand* val) - { return val->isImm() && (val->getImm() == Imm);}); + std::vector<MachineOperand *>::iterator It = + find_if(Lits, [&](MachineOperand *val) { + return val->isImm() && (val->getImm() == Imm); + }); // Get corresponding Operand MachineOperand &Operand = MI.getOperand( @@ -450,27 +450,24 @@ private: return ClauseFile(&ClauseHead, std::move(ClauseContent)); } - void - EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { + void EmitFetchClause(MachineBasicBlock::iterator InsertPos, + const DebugLoc &DL, ClauseFile &Clause, + unsigned &CfCount) { CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) - .addImm(CfCount); + BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } CfCount += 2 * Clause.second.size(); } - void - EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { + void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL, + ClauseFile &Clause, unsigned &CfCount) { Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) - .addImm(CfCount); + BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -644,17 +641,18 @@ public: break; } case AMDGPU::RETURN: { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + DebugLoc DL = MBB.findDebugLoc(MI); + BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END)); CfCount++; if (CfCount % 2) { - BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD)); CfCount++; } MI->eraseFromParent(); for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, FetchClauses[i], CfCount); + EmitFetchClause(I, DL, FetchClauses[i], CfCount); for (unsigned i = 0, e = 
AluClauses.size(); i < e; i++) - EmitALUClause(I, AluClauses[i], CfCount); + EmitALUClause(I, DL, AluClauses[i], CfCount); break; } default: @@ -680,13 +678,13 @@ public: .addImm(Alu->getOperand(8).getImm()); Alu->eraseFromParent(); } - MFI->StackSize = CFStack.MaxStackSize; + MFI->CFStackSize = CFStack.MaxStackSize; } return false; } - const char *getPassName() const override { + StringRef getPassName() const override { return "R600 Control Flow Finalizer Pass"; } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 93ed5be..9a5db6c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -307,7 +307,7 @@ public: BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(); - if (I->getOpcode() == AMDGPU::CF_ALU) + if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { if (isALU(*I)) @@ -319,7 +319,7 @@ public: return false; } - const char *getPassName() const override { + StringRef getPassName() const override { return "R600 Emit Clause Markers Pass"; } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 0385b62..3e46e63 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -42,7 +42,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { + StringRef getPassName() const override { return "R600 Expand special instructions pass"; } }; @@ -116,85 +116,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); continue; } - - case AMDGPU::INTERP_PAIR_XY: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = MI.getOperand(Chan).getReg(); - else - DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W; - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan >= 2) - TII->addFlag(*BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_ZW: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = Chan == 0 ? 
AMDGPU::T0_X : AMDGPU::T0_Y; - else - DstReg = MI.getOperand(Chan-2).getReg(); - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan < 2) - TII->addFlag(*BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_VEC_LOAD: { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(1).getImm()); - unsigned DstReg = MI.getOperand(0).getReg(); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, - TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan != 3) - TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } case AMDGPU::DOT_4: { const R600RegisterInfo &TRI = TII->getRegisterInfo(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index dd5681f..5813786 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -11,5 +11,4 @@ using namespace llvm; -R600FrameLowering::~R600FrameLowering() { -} +R600FrameLowering::~R600FrameLowering() = default; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h index 5fe4e0d..874435f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h @@ -19,12 +19,14 @@ public: R600FrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1) : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} - virtual ~R600FrameLowering(); + ~R600FrameLowering() override; - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {} - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {} + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override {} + void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const override {} }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 8ccd176..77fee435 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -17,16 +17,36 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" +#include "R600FrameLowering.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/DAGCombine.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include 
"llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> +#include <vector> using namespace llvm; @@ -72,7 +92,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); - setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); @@ -80,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); + // We need to include these since trunc STORES to PRIVATE need + // special handling to accommodate RMW + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); + setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom); + setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom); // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); @@ -192,12 +223,12 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::Source); - setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::LOAD); } const R600Subtarget *R600TargetLowering::getSubtarget() const { @@ -205,13 +236,15 @@ const R600Subtarget *R600TargetLowering::getSubtarget() const { } static inline bool isEOP(MachineBasicBlock::iterator I) { + if (std::next(I) == I->getParent()->end()) + return false; return std::next(I)->getOpcode() == AMDGPU::RETURN; } MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { - MachineFunction * MF = BB->getParent(); + MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = MI; const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); @@ -278,10 +311,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .bitcastToAPInt() .getZExtValue()); break; + case AMDGPU::MOV_IMM_I32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); break; + case AMDGPU::MOV_IMM_GLOBAL_ADDR: { //TODO: Perhaps combine this instruction with the next if possible auto MIB = TII->buildDefaultInstruction( @@ -291,6 +326,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MIB->getOperand(Idx) = MI.getOperand(1); break; } + case AMDGPU::CONST_COPY: { MachineInstr *NewMI = TII->buildDefaultInstruction( *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); @@ -301,228 +337,20 @@ 
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .addOperand(MI.getOperand(0)) .addOperand(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; - } - case AMDGPU::RAT_STORE_TYPED_eg: { + + case AMDGPU::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .addOperand(MI.getOperand(0)) .addOperand(MI.getOperand(1)) .addOperand(MI.getOperand(2)) .addImm(isEOP(I)); // Set End of program bit break; - } - - case AMDGPU::TXD: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI.getOperand(4); - MachineOperand &SID = MI.getOperand(5); - unsigned TextureId = MI.getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), - T0) - .addOperand(MI.getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), - T1) - .addOperand(MI.getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::TXD_SHADOW: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI.getOperand(4); - MachineOperand &SID = MI.getOperand(5); - unsigned TextureId = MI.getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: 
// Shadow2DArray - CTZ = 0; - break; - } - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), - T0) - .addOperand(MI.getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), - T1) - .addOperand(MI.getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } case AMDGPU::BRANCH: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) @@ -534,7 +362,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) .addOperand(MI.getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) + .addImm(AMDGPU::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) @@ -548,7 +376,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) .addOperand(MI.getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(AMDGPU::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) @@ -592,12 +420,6 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, break; } case AMDGPU::RETURN: { - // RETURN instructions must have the live-out registers as implicit uses, - // otherwise they appear dead. 
- R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); - MachineInstrBuilder MIB(*MF, MI); - for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) - MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); return BB; } } @@ -654,7 +476,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(2, DL, MVT::i32), // SWZ_Z DAG.getConstant(3, DL, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -671,15 +493,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const switch(IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case AMDGPUIntrinsic::r600_tex: - case AMDGPUIntrinsic::r600_texc: - case AMDGPUIntrinsic::r600_txl: - case AMDGPUIntrinsic::r600_txlc: - case AMDGPUIntrinsic::r600_txb: - case AMDGPUIntrinsic::r600_txbc: - case AMDGPUIntrinsic::r600_txf: - case AMDGPUIntrinsic::r600_txq: - case AMDGPUIntrinsic::r600_ddx: - case AMDGPUIntrinsic::r600_ddy: { + case AMDGPUIntrinsic::r600_texc: { unsigned TextureOp; switch (IntrinsicID) { case AMDGPUIntrinsic::r600_tex: @@ -688,32 +502,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case AMDGPUIntrinsic::r600_texc: TextureOp = 1; break; - case AMDGPUIntrinsic::r600_txl: - TextureOp = 2; - break; - case AMDGPUIntrinsic::r600_txlc: - TextureOp = 3; - break; - case AMDGPUIntrinsic::r600_txb: - TextureOp = 4; - break; - case AMDGPUIntrinsic::r600_txbc: - TextureOp = 5; - break; - case AMDGPUIntrinsic::r600_txf: - TextureOp = 6; - break; - case AMDGPUIntrinsic::r600_txq: - TextureOp = 7; - break; - case AMDGPUIntrinsic::r600_ddx: - TextureOp = 8; - break; - case AMDGPUIntrinsic::r600_ddy: - TextureOp = 9; - break; default: - llvm_unreachable("Unknow Texture Operation"); + llvm_unreachable("unhandled texture operation"); } SDValue TexArgs[19] = { @@ -785,12 +575,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); - case Intrinsic::r600_read_workdim: - case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. - uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); - return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); - } - case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T1_X, VT); @@ -836,9 +620,10 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } - // Fall-through. Since we don't care about out of bounds values - // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint - // considers some extra cases which are not necessary here. + // Since we don't care about out of bounds values we can use FP_TO_SINT for + // uints too. The DAGLegalizer code for uint considers some extra cases + // which are not necessary here. 
+ LLVM_FALLTHROUGH; case ISD::FP_TO_SINT: { if (N->getValueType(0) == MVT::i1) { Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); @@ -867,14 +652,12 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const { - SDLoc DL(Vector); EVT VecVT = Vector.getValueType(); EVT EltVT = VecVT.getVectorElementType(); SmallVector<SDValue, 8> Args; - for (unsigned i = 0, e = VecVT.getVectorNumElements(); - i != e; ++i) { + for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { Args.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); @@ -885,7 +668,6 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); SDValue Vector = Op.getOperand(0); SDValue Index = Op.getOperand(1); @@ -919,7 +701,6 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { - GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -1318,90 +1099,158 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth, SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const { SDLoc DL(Store); + //TODO: Who creates the i8 stores? + assert(Store->isTruncatingStore() + || Store->getValue().getValueType() == MVT::i8); + assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); - unsigned Mask = 0; + SDValue Mask; if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; + assert(Store->getAlignment() >= 1); + Mask = DAG.getConstant(0xff, DL, MVT::i32); } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; + assert(Store->getAlignment() >= 2); + Mask = DAG.getConstant(0xffff, DL, MVT::i32);; + } else { + llvm_unreachable("Unsupported private trunc store"); } - SDValue Chain = Store->getChain(); + SDValue OldChain = Store->getChain(); + bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN); + // Skip dummy + SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain; SDValue BasePtr = Store->getBasePtr(); + SDValue Offset = Store->getOffset(); EVT MemVT = Store->getMemoryVT(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue LoadPtr = BasePtr; + if (!Offset.isUndef()) { + LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); + } + + // Get dword location + // TODO: this should be eliminated by the future SHR ptr, 2 + SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, + DAG.getConstant(0xfffffffc, DL, MVT::i32)); + + // Load dword + // TODO: can we be smarter about machine pointer info? 
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); + + Chain = Dst.getValue(1); - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + // Get offset in dword + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); + // Convert byte offset to bit shift SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); + // TODO: Contrary to the name of the function, + // it also handles sub i32 non-truncating stores (like i1) SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); + // Mask the value to the right type SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + // Shift the value in place SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, MaskedValue, ShiftAmt); - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); + // Shift the mask in place + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt); + + // Invert the mask. NOTE: if we had native ROL instructions we could + // use inverted mask + DstMask = DAG.getNOT(DL, DstMask, MVT::i32); + + // Cleanup the target bits Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + // Add the new bits SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); + + // Store dword + // TODO: Can we be smarter about MachinePointerInfo? + SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); + + // If we are part of expanded vector, make our neighbors depend on this store + if (VectorTrunc) { + // Make all other vector elements depend on this store + Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore); + DAG.ReplaceAllUsesOfValueWith(OldChain, Chain); + } + return NewStore; } SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG)) - return Result; - StoreSDNode *StoreNode = cast<StoreSDNode>(Op); unsigned AS = StoreNode->getAddressSpace(); + + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); SDValue Value = StoreNode->getValue(); - EVT ValueVT = Value.getValueType(); + EVT VT = Value.getValueType(); + EVT MemVT = StoreNode->getMemoryVT(); + EVT PtrVT = Ptr.getValueType(); + + SDLoc DL(Op); + + // Neither LOCAL nor PRIVATE can do vectors at the moment if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && - ValueVT.isVector()) { - return SplitVectorStore(Op, DAG); + VT.isVector()) { + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { + // Add an extra level of chain to isolate this vector + SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); + // TODO: can the chain be replaced without creating a new store?
+ SDValue NewStore = DAG.getTruncStore( + NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), + MemVT, StoreNode->getAlignment(), + StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); + StoreNode = cast<StoreSDNode>(NewStore); + } + + return scalarizeVectorStore(StoreNode, DAG); } - SDLoc DL(Op); - SDValue Chain = StoreNode->getChain(); - SDValue Ptr = StoreNode->getBasePtr(); + unsigned Align = StoreNode->getAlignment(); + if (Align < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { + return expandUnalignedStore(StoreNode, DAG); + } + + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, + DAG.getConstant(2, DL, PtrVT)); if (AS == AMDGPUAS::GLOBAL_ADDRESS) { + // It is beneficial to create MSKOR here instead of combiner to avoid + // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { - EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); - EVT MemVT = StoreNode->getMemoryVT(); SDValue MaskConstant; if (MemVT == MVT::i8) { MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); } else { assert(MemVT == MVT::i16); + assert(StoreNode->getAlignment() >= 2); MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); } - SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(0x00000003, DL, VT)); + + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, + DAG.getConstant(0x00000003, DL, PtrVT)); + SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + + // Put the mask in correct place + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); + + // Put the value bits in correct place SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, DL, VT)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); - SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 // vector instead. SDValue Src[4] = { @@ -1415,12 +1264,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - ValueVT.bitsGE(MVT::i32)) { + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. 
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { llvm_unreachable("Truncated and indexed stores not supported yet"); @@ -1431,50 +1277,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } + // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); - EVT MemVT = StoreNode->getMemoryVT(); if (MemVT.bitsLT(MVT::i32)) return lowerPrivateTruncStore(StoreNode, DAG); - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector<SDValue, 4> Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + // Standard i32+ store, tag it with DWORDADDR to note that the address + // has been shifted + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); + return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } - return Chain; + // Tagged i32+ stores will be matched by patterns + return SDValue(); } // return (512 + (kc_bank << 12) @@ -1524,51 +1342,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, LoadSDNode *Load = cast<LoadSDNode>(Op); ISD::LoadExtType ExtType = Load->getExtensionType(); EVT MemVT = Load->getMemoryVT(); + assert(Load->getAlignment() >= MemVT.getStoreSize()); + + SDValue BasePtr = Load->getBasePtr(); + SDValue Chain = Load->getChain(); + SDValue Offset = Load->getOffset(); + + SDValue LoadPtr = BasePtr; + if (!Offset.isUndef()) { + LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); + } - // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, - // register (2-)byte extract. + // Get dword location + // NOTE: this should be eliminated by the future SHR ptr, 2 + SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, + DAG.getConstant(0xfffffffc, DL, MVT::i32)); - // Get Register holding the target. - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, DL, MVT::i32)); - // Load the Register. 
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), - Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); + // Load dword + // TODO: can we be smarter about machine pointer info? + SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); // Get offset within the register. SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); + LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); // Bit offset of target byte (byteIdx * 8). SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); // Eliminate the upper bits by setting them to ... EVT MemEltVT = MemVT.getScalarType(); - // ... ones. - if (ExtType == ISD::SEXTLOAD) { + if (ExtType == ISD::SEXTLOAD) { // ... ones. SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); + Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); + } else { // ... or zeros. + Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); } - // ... or zeros. SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() + Ret, + Read.getValue(1) // This should be our output chain }; return DAG.getMergeValues(Ops, DL); @@ -1590,12 +1407,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - scalarizeVectorLoad(LoadNode, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); + if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + VT.isVector()) { + return scalarizeVectorLoad(LoadNode, DAG); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1646,8 +1461,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - SDValue LoweredLoad; - // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1672,47 +1485,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT <= 4); - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); - LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); + // DWORDADDR ISD marks already shifted address + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + assert(VT == MVT::i32); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); + return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); + return SDValue(); } SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { @@ -1754,9 +1534,11 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector<ISD::InputArg, 8> LocalIns; - getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); - - AnalyzeFormalArguments(CCInfo, LocalIns); + if (AMDGPU::isShader(CallConv)) { + AnalyzeFormalArguments(CCInfo, Ins); + } else { + analyzeFormalArgumentsCompute(CCInfo, Ins); + } for (unsigned i = 0, e = Ins.size(); i < e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1800,18 +1582,19 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = 36 + VA.getLocMemOffset(); + unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, - MemVT, /* Alignment = */ 4, - MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant); + MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. 
InVals.push_back(Arg); - MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); + MFI->setABIArgOffset(Offset + MemVT.getStoreSize()); } return Chain; } @@ -1949,7 +1732,6 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], return BuildVector; } - //===----------------------------------------------------------------------===// // Custom DAG Optimizations //===----------------------------------------------------------------------===// @@ -1957,14 +1739,14 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0), Arg.getOperand(0)); } break; @@ -1989,12 +1771,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } - SDLoc dl(N); - return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), + return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), SelectCC.getOperand(0), // LHS SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, dl, MVT::i32), // True - DAG.getConstant(0, dl, MVT::i32), // False + DAG.getConstant(-1, DL, MVT::i32), // True + DAG.getConstant(0, DL, MVT::i32), // False SelectCC.getOperand(4)); // CC break; @@ -2006,7 +1787,6 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); SDValue EltNo = N->getOperand(2); - SDLoc dl(N); // If the inserted element is an UNDEF, just use the input vector. if (InVal.isUndef()) @@ -2044,13 +1824,13 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, EVT OpVT = Ops[0].getValueType(); if (InVal.getValueType() != OpVT) InVal = OpVT.bitsGT(InVal.getValueType()) ? 
- DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); + DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : + DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); Ops[Elt] = InVal; } // Return the new vector - return DAG.getBuildVector(VT, dl, Ops); + return DAG.getBuildVector(VT, DL, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -2064,11 +1844,13 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } } if (Arg.getOpcode() == ISD::BITCAST && - Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && + (Arg.getOperand(0).getValueType().getVectorNumElements() == + Arg.getValueType().getVectorNumElements())) { if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { unsigned Element = Const->getZExtValue(); - return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), - Arg->getOperand(0).getOperand(Element)); + return DAG.getNode(ISD::BITCAST, DL, N->getVTList(), + Arg->getOperand(0).getOperand(Element)); } } break; @@ -2109,7 +1891,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, LHS.getOperand(0).getValueType().isInteger()); if (DCI.isBeforeLegalizeOps() || isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) - return DAG.getSelectCC(SDLoc(N), + return DAG.getSelectCC(DL, LHS.getOperand(0), LHS.getOperand(1), LHS.getOperand(2), @@ -2121,7 +1903,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } - case AMDGPUISD::EXPORT: { + case AMDGPUISD::R600_EXPORT: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) break; @@ -2136,9 +1918,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(6), // SWZ_Z N->getOperand(7) // SWZ_W }; - SDLoc DL(N); NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); @@ -2166,10 +1947,10 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(17), N->getOperand(18), }; - SDLoc DL(N); NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); } + default: break; } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -2262,7 +2043,6 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, unsigned ImmReg = AMDGPU::ALU_LITERAL_X; uint64_t ImmValue = 0; - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); float FloatValue = FPC->getValueAPF().convertToFloat(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td index 0ffd485..68fcc54 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td @@ -210,14 +210,14 @@ class VTX_WORD0 { bits<5> VC_INST; bits<2> FETCH_TYPE; bits<1> FETCH_WHOLE_QUAD; - bits<8> BUFFER_ID; + bits<8> buffer_id; bits<1> SRC_REL; bits<2> SRC_SEL_X; let Word0{4-0} = VC_INST; let Word0{6-5} = FETCH_TYPE; let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = BUFFER_ID; + let Word0{15-8} = buffer_id; let Word0{22-16} = src_gpr; let Word0{23} = SRC_REL; let Word0{25-24} = SRC_SEL_X; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp 
b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 1c5f7ec..e88bd07 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -320,12 +320,12 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const { ConstCount = 0; - ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); const std::pair<int, unsigned> DummyPair(-1, 0); std::vector<std::pair<int, unsigned> > Result; unsigned i = 0; - for (unsigned n = Srcs.size(); i < n; ++i) { - unsigned Reg = Srcs[i].first->getReg(); + for (const auto &Src : getSrcs(MI)) { + ++i; + unsigned Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == AMDGPU::OQAP) { Result.push_back(std::make_pair(Index, 0U)); @@ -592,9 +592,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) if (!isALUInstr(MI.getOpcode())) continue; - ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); - - for (const auto &Src:Srcs) { + for (const auto &Src : getSrcs(MI)) { if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -667,7 +665,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, // handled if (isBranch(I->getOpcode())) return true; - if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) { + if (!isJump(I->getOpcode())) { return false; } @@ -682,8 +680,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst.getOpcode(); - if (I == MBB.begin() || - !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { + if (I == MBB.begin() || !isJump((--I)->getOpcode())) { if (LastOpc == AMDGPU::JUMP) { TBB = LastInst.getOperand(0).getMBB(); return false; @@ -729,17 +726,19 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { It != E; ++It) { if (It->getOpcode() == AMDGPU::CF_ALU || It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return std::prev(It.base()); + return It.getReverse(); } return MBB.end(); } -unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, +unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const { - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + const DebugLoc &DL, + int *BytesAdded) const { + assert(TBB && "insertBranch must not be told to insert a fallthrough"); + assert(!BytesAdded && "code size not handled"); if (!FBB) { if (Cond.empty()) { @@ -779,8 +778,9 @@ unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, } } -unsigned -R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { +unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { + assert(!BytesRemoved && "code size not handled"); // Note : we leave PRED* instructions there. // They may be needed when predicating instructions. 
@@ -910,20 +910,20 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, bool -R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { +R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { MachineOperand &MO = Cond[1]; switch (MO.getImm()) { - case OPCODE_IS_ZERO_INT: - MO.setImm(OPCODE_IS_NOT_ZERO_INT); + case AMDGPU::PRED_SETE_INT: + MO.setImm(AMDGPU::PRED_SETNE_INT); break; - case OPCODE_IS_NOT_ZERO_INT: - MO.setImm(OPCODE_IS_ZERO_INT); + case AMDGPU::PRED_SETNE_INT: + MO.setImm(AMDGPU::PRED_SETE_INT); break; - case OPCODE_IS_ZERO: - MO.setImm(OPCODE_IS_NOT_ZERO); + case AMDGPU::PRED_SETE: + MO.setImm(AMDGPU::PRED_SETNE); break; - case OPCODE_IS_NOT_ZERO: - MO.setImm(OPCODE_IS_ZERO); + case AMDGPU::PRED_SETNE: + MO.setImm(AMDGPU::PRED_SETE); break; default: return true; @@ -1160,10 +1160,10 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); int Offset = -1; - if (MFI->getNumObjects() == 0) { + if (MFI.getNumObjects() == 0) { return -1; } @@ -1195,14 +1195,14 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { int Offset = 0; - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); // Variable sized objects are not supported - if (MFI->hasVarSizedObjects()) { + if (MFI.hasVarSizedObjects()) { return -1; } - if (MFI->getNumObjects() == 0) { + if (MFI.getNumObjects() == 0) { return -1; } @@ -1481,11 +1481,3 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, FlagOp.setImm(InstFlags); } } - -bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; -} - -bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index feaca98..a280052 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -19,6 +19,14 @@ #include "R600RegisterInfo.h" namespace llvm { + +namespace R600InstrFlags { +enum : uint64_t { + REGISTER_STORE = UINT64_C(1) << 62, + REGISTER_LOAD = UINT64_C(1) << 63 +}; +} + class AMDGPUTargetMachine; class DFAPacketizer; class MachineFunction; @@ -151,7 +159,7 @@ public: DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const override; - bool ReverseBranchCondition( + bool reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -159,11 +167,13 @@ public: SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const override; + const DebugLoc &DL, + int *BytesAdded = nullptr) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned removeBranch(MachineBasicBlock &MBB, + int *BytesRemvoed = nullptr) 
const override; bool isPredicated(const MachineInstr &MI) const override; @@ -301,8 +311,13 @@ public: void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; + bool isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_STORE; + } + + bool isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD; + } }; namespace AMDGPU { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index b6b576d..9210e66 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -242,20 +242,6 @@ def TEX_SHADOW_ARRAY : PatLeaf< }] >; -def TEX_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 14; - }] ->; - -def TEX_ARRAY_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 15; - }] ->; - class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, dag outs, dag ins, string asm, list<dag> pattern> : InstR600ISA <outs, ins, asm, pattern>, @@ -283,8 +269,8 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, } -class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern> - : InstR600ISA <outs, (ins MEMxi:$src_gpr), !strconcat(" ", name), pattern>, +class VTX_READ <string name, dag outs, list<dag> pattern> + : InstR600ISA <outs, (ins MEMxi:$src_gpr, i8imm:$buffer_id), !strconcat(" ", name, ", #$buffer_id"), pattern>, VTX_WORD1_GPR { // Static fields @@ -333,9 +319,9 @@ class LoadParamFrag <PatFrag load_type> : PatFrag < (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] >; -def load_param : LoadParamFrag<load>; -def load_param_exti8 : LoadParamFrag<az_extloadi8>; -def load_param_exti16 : LoadParamFrag<az_extloadi16>; +def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>; +def vtx_id3_az_extloadi16 : LoadParamFrag<az_extloadi16>; +def vtx_id3_load : LoadParamFrag<load>; class LoadVtxId1 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ @@ -450,11 +436,6 @@ def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; // Export Instructions //===----------------------------------------------------------------------===// -def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; - -def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, - [SDNPHasChain, SDNPSideEffect]>; - class ExportWord0 { field bits<32> Word0; @@ -500,7 +481,7 @@ class ExportBufWord1 { } multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), (ExportInst R600_Reg128:$src, imm:$type, imm:$base, imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) @@ -746,6 +727,20 @@ def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; def MOV : R600_1OP <0x19, "MOV", []>; + +// This is a hack to get rid of DUMMY_CHAIN nodes. +// Most DUMMY_CHAINs should be eliminated during legalization, but undef +// values can sneak in some to selection. 
+let isPseudo = 1, isCodeGenOnly = 1 in { +def DUMMY_CHAIN : AMDGPUInst < + (outs), + (ins), + "DUMMY_CHAIN", + [(R600dummy_chain)] +>; +} // end let isPseudo = 1, isCodeGenOnly = 1 + + let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < @@ -1073,18 +1068,27 @@ class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < - inst, "MULHI_INT", mulhs -> { + inst, "MULHI_INT", mulhs> { let Itinerary = TransALU; } + +class MULHI_INT24_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_INT24", AMDGPUmulhi_i24> { + let Itinerary = VecALU; +} + class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < - inst, "MULHI", mulhu -> { + inst, "MULHI", mulhu> { let Itinerary = TransALU; } + +class MULHI_UINT24_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_UINT24", AMDGPUmulhi_u24> { + let Itinerary = VecALU; +} + class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < - inst, "MULLO_INT", mul -> { + inst, "MULLO_INT", mul> { let Itinerary = TransALU; } class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> { @@ -1278,6 +1282,17 @@ let Predicates = [isR600] in { defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; +// Hardcode channel to 0 +// NOTE: LSHR is not available here. LSHR is per family instruction +def : Pat < + (i32 (load_private ADDRIndirect:$addr) ), + (R600_RegisterLoad FRAMEri:$addr, (i32 0)) +>; +def : Pat < + (store_private i32:$val, ADDRIndirect:$addr), + (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0)) +>; + //===----------------------------------------------------------------------===// // Pseudo instructions @@ -1366,8 +1381,8 @@ def CONST_COPY : Instruction { } // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" def TEX_VTX_CONSTBUF : - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", - [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "VTX_READ_eg $dst, $ptr", + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$buffer_id)))]>, VTX_WORD1_GPR, VTX_WORD0_eg { let VC_INST = 0; @@ -1420,7 +1435,7 @@ def TEX_VTX_CONSTBUF : } def TEX_VTX_TEXBUF: - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">, + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "TEX_VTX_EXPLICIT_READ $dst, $ptr">, VTX_WORD1_GPR, VTX_WORD0_eg { let VC_INST = 0; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp index 01105c6..3ca319c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -12,9 +12,5 @@ using namespace llvm; - -// Pin the vtable to this file. 
-void R600MachineFunctionInfo::anchor() {} - R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF) { } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h index 04a4436..29ac092 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -14,18 +14,13 @@ #define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include <vector> namespace llvm { class R600MachineFunctionInfo final : public AMDGPUMachineFunction { - void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); - SmallVector<unsigned, 4> LiveOuts; - std::vector<unsigned> IndirectRegs; - unsigned StackSize; + unsigned CFStackSize; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h index 16d5d93..9a67705 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H #include "llvm/CodeGen/MachineScheduler.h" +#include <vector> using namespace llvm; @@ -25,10 +26,10 @@ class R600InstrInfo; struct R600RegisterInfo; class R600SchedStrategy final : public MachineSchedStrategy { - const ScheduleDAGMILive *DAG; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - MachineRegisterInfo *MRI; + const ScheduleDAGMILive *DAG = nullptr; + const R600InstrInfo *TII = nullptr; + const R600RegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; enum InstKind { IDAlu, @@ -66,11 +67,8 @@ class R600SchedStrategy final : public MachineSchedStrategy { int OccupedSlotsMask; public: - R600SchedStrategy() : - DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { - } - - virtual ~R600SchedStrategy() {} + R600SchedStrategy() = default; + ~R600SchedStrategy() override = default; void initialize(ScheduleDAGMI *dag) override; SUnit *pickNode(bool &IsTopNode) override; @@ -97,6 +95,6 @@ private: void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst); }; -} // namespace llvm +} // end namespace llvm -#endif /* R600MACHINESCHEDULER_H_ */ +#endif // LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index ecae27d..d90008a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -31,22 +31,31 @@ #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/PassAnalysisSupport.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" 
#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <utility> +#include <vector> using namespace llvm; #define DEBUG_TYPE "vec-merger" -namespace { - static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), @@ -60,11 +69,14 @@ isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { return false; } +namespace { + class RegSeqInfo { public: MachineInstr *Instr; DenseMap<unsigned, unsigned> RegToChan; std::vector<unsigned> UndefReg; + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { @@ -76,7 +88,8 @@ public: RegToChan[MO.getReg()] = Chan; } } - RegSeqInfo() {} + + RegSeqInfo() = default; bool operator==(const RegSeqInfo &RSI) const { return RSI.Instr == Instr; @@ -87,28 +100,30 @@ class R600VectorRegMerger : public MachineFunctionPass { private: MachineRegisterInfo *MRI; const R600InstrInfo *TII; - bool canSwizzle(const MachineInstr &) const; + + bool canSwizzle(const MachineInstr &MI) const; bool areAllUsesSwizzeable(unsigned Reg) const; void SwizzleInput(MachineInstr &, - const std::vector<std::pair<unsigned, unsigned> > &) const; - bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, - std::vector<std::pair<unsigned, unsigned> > &Remap) const; + const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const; + bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, + std::vector<std::pair<unsigned, unsigned>> &Remap) const; bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector<std::pair<unsigned, unsigned> > &RemapChan); + std::vector<std::pair<unsigned, unsigned>> &RemapChan); bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector<std::pair<unsigned, unsigned> > &RemapChan); - MachineInstr *RebuildVector(RegSeqInfo *MI, - const RegSeqInfo *BaseVec, - const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const; + std::vector<std::pair<unsigned, unsigned>> &RemapChan); + MachineInstr *RebuildVector(RegSeqInfo *MI, const RegSeqInfo *BaseVec, + const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const; void RemoveMI(MachineInstr *); void trackRSI(const RegSeqInfo &RSI); - typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap; + typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap; DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq; InstructionSetMap PreviousRegSeqByReg; InstructionSetMap PreviousRegSeqByUndefCount; + public: static char ID; + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), TII(nullptr) { } @@ -121,13 +136,15 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const override { + StringRef getPassName() const override { return "R600 Vector Registers Merge Pass"; } bool runOnMachineFunction(MachineFunction &Fn) override; }; +} // end anonymous namespace. 
+ char R600VectorRegMerger::ID = 0; bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) @@ -144,7 +161,7 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) } bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, - RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap) + RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(), @@ -167,7 +184,7 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, static unsigned getReassignedChan( - const std::vector<std::pair<unsigned, unsigned> > &RemapChan, + const std::vector<std::pair<unsigned, unsigned>> &RemapChan, unsigned Chan) { for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { if (RemapChan[j].first == Chan) @@ -178,7 +195,7 @@ unsigned getReassignedChan( MachineInstr *R600VectorRegMerger::RebuildVector( RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, - const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const { unsigned Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); @@ -200,12 +217,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector( .addReg(SubReg) .addImm(Chan); UpdatedRegToChan[SubReg] = Chan; - std::vector<unsigned>::iterator ChanPos = - std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); + std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan); if (ChanPos != UpdatedUndef.end()) UpdatedUndef.erase(ChanPos); - assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == - UpdatedUndef.end() && + assert(!is_contained(UpdatedUndef, Chan) && "UpdatedUndef shouldn't contain Chan more than once!"); DEBUG(dbgs() << " ->"; Tmp->dump();); (void)Tmp; @@ -236,17 +251,17 @@ void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), E = PreviousRegSeqByReg.end(); It != E; ++It) { std::vector<MachineInstr *> &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + MIs.erase(llvm::find(MIs, MI), MIs.end()); } for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { std::vector<MachineInstr *> &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + MIs.erase(llvm::find(MIs, MI), MIs.end()); } } void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, - const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const { unsigned Offset; if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) Offset = 2; @@ -274,7 +289,7 @@ bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + std::vector<std::pair<unsigned, unsigned>> &RemapChan) { for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { if (!MOp->isReg()) @@ -294,7 +309,7 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + 
std::vector<std::pair<unsigned, unsigned>> &RemapChan) { unsigned NeededUndefs = 4 - RSI.UndefReg.size(); if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) return false; @@ -357,7 +372,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { }); RegSeqInfo CandidateRSI; - std::vector<std::pair<unsigned, unsigned> > RemapChan; + std::vector<std::pair<unsigned, unsigned>> RemapChan; DEBUG(dbgs() << "Using common slots...\n";); if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { // Remove CandidateRSI mapping @@ -381,8 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { return false; } -} - llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { return new R600VectorRegMerger(tm); } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index c848664..5b6dd1e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -47,9 +47,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const override { - return "R600 Packetizer"; - } + StringRef getPassName() const override { return "R600 Packetizer"; } bool runOnMachineFunction(MachineFunction &Fn) override; }; @@ -283,7 +281,7 @@ public: return false; } - // We cannot read LDS source registrs from the Trans slot. + // We cannot read LDS source registers from the Trans slot. if (isTransSlot && TII->readsLDSSrcReg(MI)) return false; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 5f182c5..d70f52e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -102,9 +102,7 @@ public: bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "SI annotate control flow"; - } + StringRef getPassName() const override { return "SI annotate control flow"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfoWrapperPass>(); @@ -148,12 +146,15 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Break = M.getOrInsertFunction( BreakIntrinsic, Int64, Int64, (Type *)nullptr); + cast<Function>(Break)->setDoesNotAccessMemory(); IfBreak = M.getOrInsertFunction( IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); + cast<Function>(IfBreak)->setDoesNotAccessMemory();; ElseBreak = M.getOrInsertFunction( ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); + cast<Function>(ElseBreak)->setDoesNotAccessMemory(); Loop = M.getOrInsertFunction( LoopIntrinsic, Boolean, Int64, (Type *)nullptr); @@ -331,6 +332,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); + if (!L) + return; BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); @@ -361,7 +364,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { std::vector<BasicBlock*> Preds; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + if (!is_contained(Latches, *PI)) Preds.push_back(*PI); } BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp index 
65ceff3..62ebef8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp @@ -38,7 +38,7 @@ public: static char ID; SIDebuggerInsertNops() : MachineFunctionPass(ID) { } - const char *getPassName() const override { return PASS_NAME; } + StringRef getPassName() const override { return PASS_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index f4b04e3..ff4e321 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -13,76 +13,111 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H #define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H +namespace llvm { + namespace SIInstrFlags { // This needs to be kept in sync with the field bits in InstSI. -enum { - SALU = 1 << 3, - VALU = 1 << 4, - - SOP1 = 1 << 5, - SOP2 = 1 << 6, - SOPC = 1 << 7, - SOPK = 1 << 8, - SOPP = 1 << 9, - - VOP1 = 1 << 10, - VOP2 = 1 << 11, - VOP3 = 1 << 12, - VOPC = 1 << 13, +enum : uint64_t { + // Low bits - basic encoding information. + SALU = 1 << 0, + VALU = 1 << 1, + + // SALU instruction formats. + SOP1 = 1 << 2, + SOP2 = 1 << 3, + SOPC = 1 << 4, + SOPK = 1 << 5, + SOPP = 1 << 6, + + // VALU instruction formats. + VOP1 = 1 << 7, + VOP2 = 1 << 8, + VOPC = 1 << 9, + + // TODO: Should this be split into VOP3 a and b? + VOP3 = 1 << 10, + + VINTRP = 1 << 13, SDWA = 1 << 14, DPP = 1 << 15, + // Memory instruction formats. MUBUF = 1 << 16, MTBUF = 1 << 17, SMRD = 1 << 18, - DS = 1 << 19, - MIMG = 1 << 20, + MIMG = 1 << 19, + EXP = 1 << 20, FLAT = 1 << 21, - WQM = 1 << 22, + DS = 1 << 22, + + // Pseudo instruction formats. VGPRSpill = 1 << 23, - VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25, - DisableWQM = 1 << 26 + SGPRSpill = 1 << 24, + + // High bits - other information. + VM_CNT = UINT64_C(1) << 32, + EXP_CNT = UINT64_C(1) << 33, + LGKM_CNT = UINT64_C(1) << 34, + + WQM = UINT64_C(1) << 35, + DisableWQM = UINT64_C(1) << 36, + Gather4 = UINT64_C(1) << 37, + SOPK_ZEXT = UINT64_C(1) << 38, + SCALAR_STORE = UINT64_C(1) << 39, + FIXED_SIZE = UINT64_C(1) << 40, + VOPAsmPrefer32Bit = UINT64_C(1) << 41 + +}; + +// v_cmp_class_* etc. use a 10-bit mask for what operation is checked. +// The result is true if any of these tests are true. +enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity }; } -namespace llvm { namespace AMDGPU { enum OperandType { - /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET, - /// Operand with register or inline constant - OPERAND_REG_INLINE_C, - - /// Operand with 32-bit immediate that uses the constant bus. The standard - /// OPERAND_IMMEDIATE should be used for special immediates such as source - /// modifiers. - OPERAND_KIMM32 - }; -} -} - -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; - - // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. - // The result is true if any of these tests are true.
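The widened SIInstrFlags enum above merges what used to be two separately declared flag sets, keeping the encoding bits low and moving the wait-counter bits (VM_CNT, EXP_CNT, LGKM_CNT) above bit 32 of the same TSFlags word. A small standalone sketch of why the explicit uint64_t base and the UINT64_C(1) << N spelling matter; enumerator names are abbreviated for illustration:

    #include <cstdint>

    // A plain int-typed "1 << 32" would overflow; UINT64_C(1) makes the shift
    // 64-bit, and the fixed uint64_t base keeps the low encoding bits and the
    // high counter bits in one 64-bit flag word.
    enum : uint64_t {
      SALU   = 1 << 0,             // low bits: encoding groups
      VM_CNT = UINT64_C(1) << 32   // high bits: other information
    };

    static_assert(VM_CNT == 0x100000000ULL, "high flag keeps its 64-bit value");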
- enum ClassFlags { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity + /// Operands with register or 32-bit immediate + OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, + OPERAND_REG_IMM_INT64, + OPERAND_REG_IMM_INT16, + OPERAND_REG_IMM_FP32, + OPERAND_REG_IMM_FP64, + OPERAND_REG_IMM_FP16, + + /// Operands with register or inline constant + OPERAND_REG_INLINE_C_INT16, + OPERAND_REG_INLINE_C_INT32, + OPERAND_REG_INLINE_C_INT64, + OPERAND_REG_INLINE_C_FP16, + OPERAND_REG_INLINE_C_FP32, + OPERAND_REG_INLINE_C_FP64, + + OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, + OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, + + OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64, + + OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, + OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, + + // Operand for source modifiers for VOP instructions + OPERAND_INPUT_MODS, + + /// Operand with 32-bit immediate that uses the constant bus. + OPERAND_KIMM32, + OPERAND_KIMM16 }; } @@ -105,7 +140,24 @@ namespace SIOutMods { }; } -namespace llvm { +namespace VGPRIndexMode { + enum { + SRC0_ENABLE = 1 << 0, + SRC1_ENABLE = 1 << 1, + SRC2_ENABLE = 1 << 2, + DST_ENABLE = 1 << 3 + }; +} + +namespace AMDGPUAsmVariants { + enum { + DEFAULT = 0, + VOP3 = 1, + SDWA = 2, + DPP = 3 + }; +} + namespace AMDGPU { namespace EncValues { // Encoding values of enum9/8/7 operands @@ -126,9 +178,7 @@ enum { } // namespace EncValues } // namespace AMDGPU -} // namespace llvm -namespace llvm { namespace AMDGPU { namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. @@ -184,6 +234,13 @@ namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. enum Id { // HwRegCode, (6) [5:0] ID_UNKNOWN_ = -1, ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined. 
+ ID_MODE = 1, + ID_STATUS = 2, + ID_TRAPSTS = 3, + ID_HW_ID = 4, + ID_GPR_ALLOC = 5, + ID_LDS_ALLOC = 6, + ID_IB_STS = 7, ID_SYMBOLIC_LAST_ = 8, ID_SHIFT_ = 0, ID_WIDTH_ = 6, @@ -205,8 +262,27 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] }; } // namespace Hwreg + +namespace SDWA { + +enum SdwaSel { + BYTE_0 = 0, + BYTE_1 = 1, + BYTE_2 = 2, + BYTE_3 = 3, + WORD_0 = 4, + WORD_1 = 5, + DWORD = 6, +}; + +enum DstUnused { + UNUSED_PAD = 0, + UNUSED_SEXT = 1, + UNUSED_PRESERVE = 2, +}; + +} // namespace SDWA } // namespace AMDGPU -} // namespace llvm #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C @@ -312,4 +388,6 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 +} // End namespace llvm + #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 636750d..d4d3959 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -37,9 +37,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Fix CF Live Intervals"; - } + StringRef getPassName() const override { return "SI Fix CF Live Intervals"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9e0086b..6a422e7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -68,6 +68,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -82,6 +83,9 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { + + MachineDominatorTree *MDT; + public: static char ID; @@ -89,11 +93,11 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Fix SGPR copies"; - } + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -101,8 +105,12 @@ public: } // End anonymous namespace -INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, - "SI Fix SGPR copies", false, false) +INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; @@ -236,11 +244,94 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return true; } +static bool phiHasVGPROperands(const MachineInstr &PHI, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + + for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { + unsigned Reg = PHI.getOperand(i).getReg(); + if (TRI->hasVGPRs(MRI.getRegClass(Reg))) + return true; + } + return false; +} +static bool phiHasBreakDef(const MachineInstr &PHI, + const MachineRegisterInfo &MRI, + 
SmallSet<unsigned, 8> &Visited) { + + for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { + unsigned Reg = PHI.getOperand(i).getReg(); + if (Visited.count(Reg)) + continue; + + Visited.insert(Reg); + + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch (DefInstr->getOpcode()) { + default: + break; + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + return true; + case AMDGPU::PHI: + if (phiHasBreakDef(*DefInstr, MRI, Visited)) + return true; + } + } + return false; +} + +static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, + const TargetRegisterInfo &TRI) { + for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), + E = MBB.end(); I != E; ++I) { + if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) + return true; + } + return false; +} + +static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, + const MachineInstr *MoveImm, + const SIInstrInfo *TII, + unsigned &SMovOp, + int64_t &Imm) { + + if (!MoveImm->isMoveImmediate()) + return false; + + const MachineOperand *ImmOp = + TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0); + if (!ImmOp->isImm()) + return false; + + // FIXME: Handle copies with sub-regs. + if (Copy->getOperand(0).getSubReg()) + return false; + + switch (MoveImm->getOpcode()) { + default: + return false; + case AMDGPU::V_MOV_B32_e32: + SMovOp = AMDGPU::S_MOV_B32; + break; + case AMDGPU::V_MOV_B64_PSEUDO: + SMovOp = AMDGPU::S_MOV_B64; + break; + } + Imm = ImmOp->getImm(); + return true; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); SmallVector<MachineInstr *, 16> Worklist; @@ -264,18 +355,40 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg()); + unsigned SMovOp; + int64_t Imm; + // If we are just copying an immediate, we can replace the copy with + // s_mov_b32. + if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) { + MI.getOperand(1).ChangeToImmediate(Imm); + MI.addImplicitDefUseOperands(MF); + MI.setDesc(TII->get(SMovOp)); + break; + } TII->moveToVALU(MI); } break; } case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; + // We don't need to fix the PHI if the common dominator of the + // two incoming blocks terminates with a uniform branch. + if (MI.getNumExplicitOperands() == 5) { + MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); + MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); + + MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1); + if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) { + DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); + break; + } + } + // If a PHI node defines an SGPR and any of its operands are VGPRs, // then we need to move it to the VALU. // @@ -302,10 +415,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { // ... // use sgpr2 // - // FIXME: This is OK if the branching decision is made based on an - // SGPR value. 
- bool SGPRBranch = false; - // The one exception to this rule is when one of the operands // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK // instruction. In this case, there we know the program will @@ -313,31 +422,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { // the first block (where the condition is computed), so there // is no chance for values to be over-written. - bool HasBreakDef = false; - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { - TII->moveToVALU(MI); - break; - } - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); - switch(DefInstr->getOpcode()) { - - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - // If we see a PHI instruction that defines an SGPR, then that PHI - // instruction has already been considered and should have - // a *_BREAK as an operand. - case AMDGPU::PHI: - HasBreakDef = true; - break; - } - } - - if (!SGPRBranch && !HasBreakDef) + SmallSet<unsigned, 8> Visited; + if (phiHasVGPROperands(MI, MRI, TRI, TII) || + !phiHasBreakDef(MI, MRI, Visited)) { + DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI); + } break; } case AMDGPU::REG_SEQUENCE: { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 4ecc0fc..a5c0d49 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,9 +25,55 @@ using namespace llvm; namespace { +struct FoldCandidate { + MachineInstr *UseMI; + union { + MachineOperand *OpToFold; + uint64_t ImmToFold; + int FrameIndexToFold; + }; + unsigned char UseOpNo; + MachineOperand::MachineOperandType Kind; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) { + if (FoldOp->isImm()) { + ImmToFold = FoldOp->getImm(); + } else if (FoldOp->isFI()) { + FrameIndexToFold = FoldOp->getIndex(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isFI() const { + return Kind == MachineOperand::MO_FrameIndex; + } + + bool isImm() const { + return Kind == MachineOperand::MO_Immediate; + } + + bool isReg() const { + return Kind == MachineOperand::MO_Register; + } +}; + class SIFoldOperands : public MachineFunctionPass { public: static char ID; + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + void foldOperand(MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + + void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; public: SIFoldOperands() : MachineFunctionPass(ID) { @@ -36,9 +82,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Fold Operands"; - } + StringRef getPassName() const override { return "SI Fold Operands"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -46,29 +90,6 @@ public: } }; -struct FoldCandidate { - MachineInstr *UseMI; - unsigned UseOpNo; - MachineOperand *OpToFold; - uint64_t ImmToFold; - - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), UseOpNo(OpNo) { - - if (FoldOp->isImm()) { - OpToFold = nullptr; - ImmToFold = FoldOp->getImm(); - } else { - assert(FoldOp->isReg()); - 
OpToFold = FoldOp; - } - } - - bool isImm() const { - return !OpToFold; - } -}; - } // End anonymous namespace. INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, @@ -78,15 +99,50 @@ char SIFoldOperands::ID = 0; char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +// Wrapper around isInlineConstant that understands special cases when +// instruction types are replaced during operand folding. +static bool isInlineConstantIfFolded(const SIInstrInfo *TII, + const MachineInstr &UseMI, + unsigned OpNo, + const MachineOperand &OpToFold) { + if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) + return true; + + unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: { + // Special case for mac. Since this is replaced with mad when folded into + // src2, we need to check the legality for the final instruction. + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (static_cast<int>(OpNo) == Src2Idx) { + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + const MCInstrDesc &MadDesc + = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); + } + } + default: + return false; + } +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } -static bool isSafeToFold(unsigned Opcode) { - switch(Opcode) { +static bool isSafeToFold(const MachineInstr &MI) { + switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_PSEUDO: { + // If there are additional implicit register operands, this may be used for + // register indexing so the source register operand isn't simply copied. + unsigned NumOps = MI.getDesc().getNumOperands() + + MI.getDesc().getNumImplicitUses(); + + return MI.getNumOperands() == NumOps; + } case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: @@ -107,6 +163,11 @@ static bool updateOperand(FoldCandidate &Fold, return true; } + if (Fold.isFI()) { + Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + return true; + } + MachineOperand *New = Fold.OpToFold; if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { @@ -119,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold, return false; } -static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, const MachineInstr *MI) { for (auto Candidate : FoldList) { if (Candidate.UseMI == MI) @@ -128,19 +189,21 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, return false; } -static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, +static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { - // Special case for v_mac_f32_e64 if we are trying to fold into src2 + // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_MAC_F32_e64 && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - // Check if changing this to a v_mad_f32 instruction will allow us to - // fold the operand. 
- MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + + // Check if changing this to a v_mad_{f16, f32} instruction will allow us + // to fold the operand. + MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); @@ -149,6 +212,13 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, MI->setDesc(TII->get(Opc)); } + // Special case for s_setreg_b32 + if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) { + MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32)); + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; + } + // If we are already folding into another operand of MI, then // we can't commute the instruction, otherwise we risk making the // other fold illegal. @@ -188,108 +258,432 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } -static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, - std::vector<FoldCandidate> &FoldList, - SmallVectorImpl<MachineInstr *> &CopiesToReplace, - const SIInstrInfo *TII, const SIRegisterInfo &TRI, - MachineRegisterInfo &MRI) { +// If the use operand doesn't care about the value, this may be an operand only +// used for register indexing, in which case it is unsafe to fold. +static bool isUseSafeToFold(const MachineInstr &MI, + const MachineOperand &UseMO) { + return !UseMO.isUndef(); + //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); +} + +void SIFoldOperands::foldOperand( + MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + if (!isUseSafeToFold(*UseMI, UseOp)) + return; + // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { + if (UseOp.isReg() && OpToFold.isReg()) { + if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister) + return; + + // Don't fold subregister extracts into tied operands, only if it is a full + // copy since a subregister use tied to a full register def doesn't really + // make sense. e.g. don't fold: + // + // %vreg1 = COPY %vreg0:sub1 + // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0> + // + // into + // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0> + if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) + return; + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->isRegSequence()) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); + RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace); + } + return; } + bool FoldingImm = OpToFold.isImm(); - APInt Imm; - if (FoldingImm) { + // In order to fold immediates into copies, we need to change the + // copy to a MOV. 
+ if (FoldingImm && UseMI->isCopy()) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI->getRegClass(DestReg) : + TRI->getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } else { + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + } + + if (!FoldingImm) { + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + return; + } + + + const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); + const TargetRegisterClass *FoldRC = + TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); + + APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), + OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { unsigned UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); + MRI->getRegClass(UseReg) : + TRI->getPhysRegClass(UseReg); - const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); - const TargetRegisterClass *FoldRC = - TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + assert(Imm.getBitWidth() == 64); - // Split 64-bit constants into 32-bits for folding. - if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - return; + if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) + return; - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); } + } - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? 
- MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - return; - - UseMI->setDesc(TII->get(MovOp)); - CopiesToReplace.push_back(UseMI); + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); +} + +static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, + uint32_t LHS, uint32_t RHS) { + switch (Opcode) { + case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_AND_B32_e32: + case AMDGPU::S_AND_B32: + Result = LHS & RHS; + return true; + case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_OR_B32_e32: + case AMDGPU::S_OR_B32: + Result = LHS | RHS; + return true; + case AMDGPU::V_XOR_B32_e64: + case AMDGPU::V_XOR_B32_e32: + case AMDGPU::S_XOR_B32: + Result = LHS ^ RHS; + return true; + case AMDGPU::V_LSHL_B32_e64: + case AMDGPU::V_LSHL_B32_e32: + case AMDGPU::S_LSHL_B32: + // The instruction ignores the high bits for out of bounds shifts. + Result = LHS << (RHS & 31); + return true; + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32: + Result = RHS << (LHS & 31); + return true; + case AMDGPU::V_LSHR_B32_e64: + case AMDGPU::V_LSHR_B32_e32: + case AMDGPU::S_LSHR_B32: + Result = LHS >> (RHS & 31); + return true; + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32: + Result = RHS >> (LHS & 31); + return true; + case AMDGPU::V_ASHR_I32_e64: + case AMDGPU::V_ASHR_I32_e32: + case AMDGPU::S_ASHR_I32: + Result = static_cast<int32_t>(LHS) >> (RHS & 31); + return true; + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32: + Result = static_cast<int32_t>(RHS) >> (LHS & 31); + return true; + default: + return false; + } +} + +static unsigned getMovOpc(bool IsScalar) { + return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; +} + +/// Remove any leftover implicit operands from mutating the instruction. e.g. +/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def +/// anymore. +static void stripExtraCopyOperands(MachineInstr &MI) { + const MCInstrDesc &Desc = MI.getDesc(); + unsigned NumOps = Desc.getNumOperands() + + Desc.getNumImplicitUses() + + Desc.getNumImplicitDefs(); + + for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) + MI.RemoveOperand(I); +} + +static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { + MI.setDesc(NewDesc); + stripExtraCopyOperands(MI); +} + +static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, + MachineOperand &Op) { + if (Op.isReg()) { + // If this has a subregister, it obviously is a register source. + if (Op.getSubReg() != AMDGPU::NoSubRegister) + return &Op; + + MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def->isMoveImmediate()) { + MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return &ImmSrc; } } - // Special case for REG_SEQUENCE: We can't fold literals into - // REG_SEQUENCE instructions, so we have to fold them into the - // uses of REG_SEQUENCE. - if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { - unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); - unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + return &Op; +} - for (MachineRegisterInfo::use_iterator - RSUse = MRI.use_begin(RegSeqDstReg), - RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { +// Try to simplify operations with a constant that may appear after instruction +// selection. +// TODO: See if a frame index with a fixed offset can fold. 
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI, + const SIInstrInfo *TII, + MachineInstr *MI, + MachineOperand *ImmOp) { + unsigned Opc = MI->getOpcode(); + if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || + Opc == AMDGPU::S_NOT_B32) { + MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + return true; + } - MachineInstr *RSUseMI = RSUse->getParent(); - if (RSUse->getSubReg() != RegSeqDstSubReg) - continue; + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; - foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); + + if (!Src0->isImm() && !Src1->isImm()) + return false; + + // and k0, k1 -> v_mov_b32 (k0 & k1) + // or k0, k1 -> v_mov_b32 (k0 | k1) + // xor k0, k1 -> v_mov_b32 (k0 ^ k1) + if (Src0->isImm() && Src1->isImm()) { + int32_t NewImm; + if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm())) + return false; + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg()); + + // Be careful to change the right operand, src0 may belong to a different + // instruction. + MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); + MI->RemoveOperand(Src1Idx); + mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); + return true; + } + + if (!MI->isCommutable()) + return false; + + if (Src0->isImm() && !Src1->isImm()) { + std::swap(Src0, Src1); + std::swap(Src0Idx, Src1Idx); + } + + int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); + if (Opc == AMDGPU::V_OR_B32_e64 || + Opc == AMDGPU::V_OR_B32_e32 || + Opc == AMDGPU::S_OR_B32) { + if (Src1Val == 0) { + // y = or x, 0 => y = copy x + MI->RemoveOperand(Src1Idx); + mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + } else if (Src1Val == -1) { + // y = or x, -1 => y = v_mov_b32 -1 + MI->RemoveOperand(Src1Idx); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); + } else + return false; + + return true; + } + + if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 || + MI->getOpcode() == AMDGPU::V_AND_B32_e32 || + MI->getOpcode() == AMDGPU::S_AND_B32) { + if (Src1Val == 0) { + // y = and x, 0 => y = v_mov_b32 0 + MI->RemoveOperand(Src0Idx); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); + } else if (Src1Val == -1) { + // y = and x, -1 => y = copy x + MI->RemoveOperand(Src1Idx); + mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + stripExtraCopyOperands(*MI); + } else + return false; + + return true; + } + + if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 || + MI->getOpcode() == AMDGPU::V_XOR_B32_e32 || + MI->getOpcode() == AMDGPU::S_XOR_B32) { + if (Src1Val == 0) { + // y = xor x, 0 => y = copy x + MI->RemoveOperand(Src1Idx); + mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + return true; } - return; } - const MCInstrDesc &UseDesc = UseMI->getDesc(); + return false; +} - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. 
- if (UseDesc.isVariadic() || - UseDesc.OpInfo[UseOpIdx].RegClass == -1) - return; +void SIFoldOperands::foldInstOperand(MachineInstr &MI, + MachineOperand &OpToFold) const { + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + SmallVector<FoldCandidate, 4> FoldList; + MachineOperand &Dst = MI.getOperand(0); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); - return; - } + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; + + MachineRegisterInfo::use_iterator NextUse, NextInstUse; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); + + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy. This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + + if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { + DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + + // Some constant folding cases change the same immediate's use to a new + // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user + // again. The same constant folded instruction could also have a second + // use operand. + NextUse = MRI->use_begin(Dst.getReg()); + continue; + } + + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &*Use; + NonInlineUseOpNo = OpNo; + } + } + } + + if (NumLiteralUses == 1) { + MachineInstr *UseMI = NonInlineUse->getParent(); + foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); + } + } else { + // Folding register. + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; ++Use) { + MachineInstr *UseMI = Use->getParent(); - tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + foldOperand(OpToFold, UseMI, Use.getOperandNo(), + FoldList, CopiesToReplace); + } + } - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. 
The shrink operands pass - // already does this. - return; + MachineFunction *MF = MI.getParent()->getParent(); + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(*MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, *TRI)) { + // Clear kill flags. + if (Fold.isReg()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI->clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + } + } } bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { @@ -298,12 +692,12 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; @@ -311,25 +705,16 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (!isSafeToFold(MI.getOpcode())) + if (!isSafeToFold(MI)) continue; - unsigned OpSize = TII->getOpSize(MI, 1); MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm(); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. + // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && - !MRI.hasOneUse(MI.getOperand(0).getReg())) - continue; - if (OpToFold.isReg() && !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; @@ -345,40 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - // We need mutate the operands of new mov instructions to add implicit - // uses of EXEC, but adding them invalidates the use_iterator, so defer - // this. - SmallVector<MachineInstr *, 4> CopiesToReplace; - - std::vector<FoldCandidate> FoldList; - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); - Use != E; ++Use) { - - MachineInstr *UseMI = Use->getParent(); - - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); - } - - // Make sure we add EXEC uses to any new v_mov instructions created. - for (MachineInstr *Copy : CopiesToReplace) - Copy->addImplicitDefUseOperands(MF); - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (!Fold.isImm()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - // FIXME: Probably shouldn't bother trying to fold if not an - // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR - // copies. 
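The foldInstOperand hunk above folds inline immediates into any number of uses but folds other literals only into a single use, because every literal use costs an extra 32-bit dword in the encoding. A rough standalone stand-in for the integer half of that distinction; the real check is TII->isInlineConstant, which also accepts a handful of floating-point values:

    #include <cstdint>

    // GCN encodes small integers directly in the source operand field, so
    // folding them never grows the instruction; anything outside this range
    // must be emitted as a separate 32-bit literal dword.
    bool isIntegerInlineImm(int64_t V) {
      return V >= -16 && V <= 64;
    }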
- MRI.clearKillFlags(Fold.OpToFold->getReg()); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); - } - } + foldInstOperand(MI, OpToFold); } } return false; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 03b11f0..0b57155 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -21,20 +21,168 @@ using namespace llvm; -static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, - const MachineFrameInfo *FrameInfo) { - return FuncInfo->hasSpilledSGPRs() && - (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects()); -} - -static ArrayRef<MCPhysReg> getAllSGPR128() { +static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF, + const SIRegisterInfo *TRI) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), - AMDGPU::SGPR_128RegClass.getNumRegs()); + TRI->getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs() { +static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF, + const SIRegisterInfo *TRI) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - AMDGPU::SGPR_32RegClass.getNumRegs()); + TRI->getMaxNumSGPRs(MF)); +} + +void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, + const SIRegisterInfo* TRI, + MachineFunction &MF, + MachineBasicBlock &MBB) const { + // We don't need this if we only have spills since there is no user facing + // scratch. + + // TODO: If we know we don't have flat instructions earlier, we can omit + // this from the input registers. + // + // TODO: We only need to know if we access scratch space through a flat + // pointer. Because we only detect if flat instructions are used at all, + // this will be used more often than necessary on VI. + + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. + DebugLoc DL; + MachineBasicBlock::iterator I = MBB.begin(); + + unsigned FlatScratchInitReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + // Copy the size in bytes. + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + + unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + + // Add wave offset in bytes to private base offset. + // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + + // Convert offset to 256-byte units. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitLo, RegState::Kill) + .addImm(8); +} + +unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( + const SISubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, + MachineFunction &MF) const { + + // We need to insert initialization of the scratch resource descriptor. 
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + if (ScratchRsrcReg == AMDGPU::NoRegister) + return AMDGPU::NoRegister; + + if (ST.hasSGPRInitBug() || + ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) + return ScratchRsrcReg; + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; + ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI); + AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); + + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : AllSGPR128s) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + //assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + MFI->setScratchRSrcReg(Reg); + return Reg; + } + } + + return ScratchRsrcReg; +} + +unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( + const SISubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, + MachineFunction &MF) const { + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ST.hasSGPRInitBug() || + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) + return ScratchWaveOffsetReg; + + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + + ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI); + if (NumPreloaded > AllSGPRs.size()) + return ScratchWaveOffsetReg; + + AllSGPRs = AllSGPRs.slice(NumPreloaded); + + // We need to drop registers from the end of the list that we cannot use + // for the scratch wave offset. + // + 2 s102 and s103 do not exist on VI. + // + 2 for vcc + // + 2 for xnack_mask + // + 2 for flat_scratch + // + 4 for registers reserved for scratch resource register + // + 1 for register reserved for scratch wave offset. (By excluding this + // register from the list to consider, it means that when this + // register is being used for the scratch wave offset and there + // are no other free SGPRs, then the value will stay in this register.) + // ---- + // 13 + if (AllSGPRs.size() < 13) + return ScratchWaveOffsetReg; + + for (MCPhysReg Reg : AllSGPRs.drop_back(13)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven't added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) { + if (!MRI.isAllocatable(Reg) || + TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) + continue; + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + MFI->setScratchWaveOffsetReg(Reg); + return Reg; + } + } + + return ScratchWaveOffsetReg; } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -45,9 +193,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); - if (!MF.getFrameInfo()->hasStackObjects()) - return; - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -57,200 +202,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // // FIXME: We should be cleaning up these unused SGPR spill frame indices // somewhere. - if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) - return; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineBasicBlock::iterator I = MBB.begin(); - - // We need to insert initialization of the scratch resource descriptor. - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - assert(ScratchRsrcReg != AMDGPU::NoRegister); - - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); - unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + unsigned ScratchRsrcReg + = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + unsigned ScratchWaveOffsetReg + = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdHsaOS()) { - PreloadedPrivateBufferReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (ScratchRsrcReg == AMDGPU::NoRegister) { + assert(ScratchWaveOffsetReg == AMDGPU::NoRegister); + return; } - if (MFI->hasFlatScratchInit()) { - // We don't need this if we only have spills since there is no user facing - // scratch. - - // TODO: If we know we don't have flat instructions earlier, we can omit - // this from the input registers. - // - // TODO: We only need to know if we access scratch space through a flat - // pointer. Because we only detect if flat instructions are used at all, - // this will be used more often than necessary on VI. - - // Debug location must be unknown since the first debug location is used to - // determine the end of the prologue. - DebugLoc DL; - - unsigned FlatScratchInitReg - = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); - MRI.addLiveIn(FlatScratchInitReg); - MBB.addLiveIn(FlatScratchInitReg); + // We need to do the replacement of the private segment buffer and wave offset + // register even if there are no stack objects. There could be stores to undef + // or a constant without an associated object. - // Copy the size in bytes. - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitHi, RegState::Kill); + // FIXME: We still have implicit uses on SGPR spill instructions in case they + // need to spill to vector memory. It's likely that will not happen, but at + // this point it appears we need the setup. This part of the prolog should be + // emitted after frame indices are eliminated. 
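The arithmetic behind the two register-picking helpers defined above (and called just above in emitPrologue) is worth spelling out. The 128-bit resource helper rounds the preloaded SGPR count up to whole 4-register tuples, NumPreloaded = (getNumPreloadedSGPRs() + 3) / 4, so for example 6 preloaded SGPRs skip 2 SGPR_128 candidates. The wave-offset helper drops the last 13 SGPRs from the candidate list, per its comment: 2 (s102/s103 absent on VI) + 2 (vcc) + 2 (xnack_mask) + 2 (flat_scratch) + 4 (scratch resource) + 1 (the wave-offset register itself) = 13. A rough sketch of that scan, with the MachineRegisterInfo queries replaced by caller-supplied predicates; the names here are illustrative only.

    #include <functional>
    #include <vector>

    // Model of getReservedPrivateSegmentWaveByteOffsetReg's scan: skip the
    // preloaded SGPRs, drop the last 13 candidates, and return the first
    // register that is unused, allocatable, and does not alias the scratch
    // resource. Registers are plain indices; -1 means "keep the current choice".
    int pickWaveOffsetSGPR(const std::vector<int> &AllSGPRs,
                           unsigned NumPreloaded,
                           const std::function<bool(int)> &IsUsed,
                           const std::function<bool(int)> &IsAllocatable,
                           const std::function<bool(int)> &AliasesRsrc) {
      if (NumPreloaded > AllSGPRs.size())
        return -1;
      std::vector<int> Candidates(AllSGPRs.begin() + NumPreloaded, AllSGPRs.end());
      if (Candidates.size() < 13)
        return -1;
      Candidates.resize(Candidates.size() - 13);    // drop_back(13)
      for (int Reg : Candidates)
        if (!IsUsed(Reg) && IsAllocatable(Reg) && !AliasesRsrc(Reg))
          return Reg;                               // first acceptable SGPR
      return -1;
    }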
- unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + emitFlatScratchInit(TII, TRI, MF, MBB); - // Add wave offset in bytes to private base offset. - // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + // We need to insert initialization of the scratch resource descriptor. + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - // Convert offset to 256-byte units. - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) - .addReg(FlatScrInitLo, RegState::Kill) - .addImm(8); - } - // If we reserved the original input registers, we don't need to copy to the - // reserved registers. - if (ScratchRsrcReg == PreloadedPrivateBufferReg) { - // We should always reserve these 5 registers at the same time. - assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && - "scratch wave offset and private segment buffer inconsistent"); - return; + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } + bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg); + bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg); // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + if (OffsetRegUsed) { + assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister && + "scratch wave offset input is required"); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + } - if (ST.isAmdHsaOS()) { + if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { + assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } - if (!ST.hasSGPRInitBug()) { - // We reserved the last registers for this. Shift it down to the end of those - // which were actually used. - // - // FIXME: It might be safer to use a pseudoregister before replacement. - - // FIXME: We should be able to eliminate unused input registers. We only - // cannot do this for the resources required for scratch access. For now we - // skip over user SGPRs and may leave unused holes. - - // We find the resource first because it has an alignment requirement. - if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. - for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { - // Pick the first unallocated one. Make sure we don't clobber the other - // reserved input we needed. - if (!MRI.isPhysRegUsed(Reg)) { - assert(MRI.isAllocatable(Reg)); - MRI.replaceRegWith(ScratchRsrcReg, Reg); - ScratchRsrcReg = Reg; - MFI->setScratchRSrcReg(ScratchRsrcReg); - break; - } - } - } + // Make the register selected live throughout the function. 
+ for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; - if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - - // We need to drop register from the end of the list that we cannot use - // for the scratch wave offset. - // + 2 s102 and s103 do not exist on VI. - // + 2 for vcc - // + 2 for xnack_mask - // + 2 for flat_scratch - // + 4 for registers reserved for scratch resource register - // + 1 for register reserved for scratch wave offset. (By exluding this - // register from the list to consider, it means that when this - // register is being used for the scratch wave offset and there - // are no other free SGPRs, then the value will stay in this register. - // ---- - // 13 - for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) { - // Pick the first unallocated SGPR. Be careful not to pick an alias of the - // scratch descriptor, since we haven’t added its uses yet. - if (!MRI.isPhysRegUsed(Reg)) { - if (!MRI.isAllocatable(Reg) || - TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) - continue; - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - ScratchWaveOffsetReg = Reg; - MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); - break; - } - } - } + if (OffsetRegUsed) + OtherBB.addLiveIn(ScratchWaveOffsetReg); + + if (ResourceRegUsed) + OtherBB.addLiveIn(ScratchRsrcReg); } + DebugLoc DL; + MachineBasicBlock::iterator I = MBB.begin(); - assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. - const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - DebugLoc DL; + bool CopyBuffer = ResourceRegUsed && + PreloadedPrivateBufferReg != AMDGPU::NoRegister && + ST.isAmdCodeObjectV2(MF) && + ScratchRsrcReg != PreloadedPrivateBufferReg; + + // This needs to be careful of the copying order to avoid overwriting one of + // the input registers before it's been copied to it's final + // destination. Usually the offset should be copied first. + bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg, + ScratchWaveOffsetReg); + if (CopyBuffer && CopyBufferFirst) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) + .addReg(PreloadedPrivateBufferReg, RegState::Kill); + } - if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { - // Make sure we emit the copy for the offset first. We may have chosen to copy - // the buffer resource into a register that aliases the input offset register. - BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + if (OffsetRegUsed && + PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); } - if (ST.isAmdHsaOS()) { - // Insert copies from argument register. 
- assert( - !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && - !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); - - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); - unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); - - unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); - unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + if (CopyBuffer && !CopyBufferFirst) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) + .addReg(PreloadedPrivateBufferReg, RegState::Kill); + } - const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) { + assert(!ST.isAmdCodeObjectV2(MF)); + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - BuildMI(MBB, I, DL, SMovB64, Rsrc01) - .addReg(Lo, RegState::Kill); - BuildMI(MBB, I, DL, SMovB64, Rsrc23) - .addReg(Hi, RegState::Kill); - } else { - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - BuildMI(MBB, I, DL, SMovB32, Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - BuildMI(MBB, I, DL, SMovB32, Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + if (MFI->hasPrivateMemoryInputPtr()) { + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, Mov64, Rsrc01) + .addReg(PreloadedPrivateBufferReg) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } else { + const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); + + PointerType *PtrTy = + PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()), + AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + auto MMO = MF.getMachineMemOperand(PtrInfo, + MachineMemOperand::MOLoad | + MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 0, 0); + BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) + .addReg(PreloadedPrivateBufferReg) + .addImm(0) // offset + .addImm(0) // glc + .addMemOperand(MMO) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + } BuildMI(MBB, I, DL, SMovB32, Rsrc2) .addImm(Rsrc23 & 0xffffffff) @@ -260,15 +364,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } - - // Make the register selected live throughout the function. 
- for (MachineBasicBlock &OtherBB : MF) { - if (&OtherBB == &MBB) - continue; - - OtherBB.addLiveIn(ScratchRsrcReg); - OtherBB.addLiveIn(ScratchWaveOffsetReg); - } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -279,20 +374,20 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI->hasStackObjects()) + if (!MFI.hasStackObjects()) return; - bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects(); assert((RS || !MayNeedScavengingEmergencySlot) && "RegScavenger required if spilling"); if (MayNeedScavengingEmergencySlot) { - int ScavengeFI = MFI->CreateSpillStackObject( + int ScavengeFI = MFI.CreateStackObject( AMDGPU::SGPR_32RegClass.getSize(), - AMDGPU::SGPR_32RegClass.getAlignment()); + AMDGPU::SGPR_32RegClass.getAlignment(), false); RS->addScavengingFrameIndex(ScavengeFI); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 37417d0..7657b4e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -14,12 +14,17 @@ namespace llvm { +class SIInstrInfo; +class SIMachineFunctionInfo; +class SIRegisterInfo; +class SISubtarget; + class SIFrameLowering final : public AMDGPUFrameLowering { public: SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1) : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} - ~SIFrameLowering() override {} + ~SIFrameLowering() override = default; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; @@ -31,10 +36,29 @@ public: RegScavenger *RS = nullptr) const override; private: + void emitFlatScratchInit(const SIInstrInfo *TII, + const SIRegisterInfo* TRI, + MachineFunction &MF, + MachineBasicBlock &MBB) const; + + unsigned getReservedPrivateSegmentBufferReg( + const SISubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, + MachineFunction &MF) const; + + unsigned getReservedPrivateSegmentWaveByteOffsetReg( + const SISubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, + MachineFunction &MF) const; + /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 80d4435..b98f9f4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -21,6 +21,7 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -31,17 +32,18 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" using namespace llvm; -// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv. 
-static cl::opt<bool> EnableAMDGPUFastFDIV( - "amdgpu-fast-fdiv", - cl::desc("Enable faster 2.5 ulp fdiv"), +static cl::opt<bool> EnableVGPRIndexMode( + "amdgpu-vgpr-index-mode", + cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); + static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -58,7 +60,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); @@ -77,6 +79,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + if (Subtarget->has16BitInsts()) { + addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); + } + computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -92,9 +99,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); @@ -111,6 +129,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i1, Promote); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); @@ -159,6 +178,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that + // is expanded to avoid having two separate loops in case the index is a VGPR. + // Most operations are naturally 32-bit vector operations. We only support // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 
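As context for the loop that follows, which marks most v2i64/v2f64 operations as Promote to v4i32: for bit-pattern-preserving operations this amounts to a pair of bitcasts around the 32-bit form. A hedged sketch of the value-level effect for one such operation, in plain C++ rather than the SelectionDAG legalizer itself:

    #include <array>
    #include <cstdint>
    #include <cstring>

    // Effect of setOperationAction(..., Vec64, Promote) plus
    // AddPromotedToType(..., Vec64, MVT::v4i32) for an op whose result is
    // insensitive to the element width (AND/OR/XOR and similar): bitcast to
    // v4i32, operate lane-wise, bitcast back.
    std::array<uint64_t, 2> promotedAndV2I64(std::array<uint64_t, 2> A,
                                             std::array<uint64_t, 2> B) {
      std::array<uint32_t, 4> A32, B32, R32;
      std::memcpy(A32.data(), A.data(), sizeof(A));  // bitcast v2i64 -> v4i32
      std::memcpy(B32.data(), B.data(), sizeof(B));
      for (int I = 0; I < 4; ++I)
        R32[I] = A32[I] & B32[I];                    // the promoted v4i32 op
      std::array<uint64_t, 2> R;
      std::memcpy(R.data(), R32.data(), sizeof(R));  // bitcast back to v2i64
      return R;
    }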
for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { @@ -218,6 +240,83 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::Constant, MVT::i16, Legal); + + setOperationAction(ISD::SMIN, MVT::i16, Legal); + setOperationAction(ISD::SMAX, MVT::i16, Legal); + + setOperationAction(ISD::UMIN, MVT::i16, Legal); + setOperationAction(ISD::UMAX, MVT::i16, Legal); + + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); + AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); + + setOperationAction(ISD::ROTR, MVT::i16, Promote); + setOperationAction(ISD::ROTL, MVT::i16, Promote); + + setOperationAction(ISD::SDIV, MVT::i16, Promote); + setOperationAction(ISD::UDIV, MVT::i16, Promote); + setOperationAction(ISD::SREM, MVT::i16, Promote); + setOperationAction(ISD::UREM, MVT::i16, Promote); + + setOperationAction(ISD::BSWAP, MVT::i16, Promote); + setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); + + setOperationAction(ISD::CTTZ, MVT::i16, Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTLZ, MVT::i16, Promote); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + + setOperationAction(ISD::BR_CC, MVT::i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i16, Custom); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + + setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); + AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); + + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + + // F16 - Constant Actions. + setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + + // F16 - Load/Store Actions. + setOperationAction(ISD::LOAD, MVT::f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); + setOperationAction(ISD::STORE, MVT::f16, Promote); + AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); + + // F16 - VOP1 Actions. + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); + + // F16 - VOP2 Actions. + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FDIV, MVT::f16, Custom); + + // F16 - VOP3 Actions. 
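A quick legend for the action kinds used throughout the 16-bit block above (the f16 VOP3 actions continue immediately below): Legal means the op/type pair maps directly onto an instruction, Promote means the legalizer widens the type or reroutes it via AddPromotedToType, Expand rewrites the op in terms of other operations, and Custom defers to the target's LowerOperation. As a sketch of what one Promote entry means in practice, assuming the usual sign-extend/operate/truncate pattern for a signed i16 operation; the function name is illustrative only.

    #include <cstdint>

    // Model of an i16 op marked Promote above (e.g. ISD::SDIV, MVT::i16):
    // widen the operands to i32, perform the legal 32-bit operation, and
    // truncate the result. B is assumed non-zero.
    int16_t promotedSDivI16(int16_t A, int16_t B) {
      int32_t WideA = A;                      // sign_extend to i32
      int32_t WideB = B;
      int32_t WideR = WideA / WideB;          // the legal 32-bit divide
      return static_cast<int16_t>(WideR);     // truncate back to i16
    }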
+ setOperationAction(ISD::FMA, MVT::f16, Legal); + if (!Subtarget->hasFP16Denormals()) + setOperationAction(ISD::FMAD, MVT::f16, Legal); + } + setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -229,6 +328,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FCANONICALIZE); @@ -357,6 +458,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, case AMDGPUAS::CONSTANT_ADDRESS: { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. + // FIXME: Can we get the real alignment here? if (AM.BaseOffs % 4 != 0) return isLegalMUBUFAddressingMode(AM); @@ -435,8 +537,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, // which isn't a simple VT. - if (!VT.isSimple() || VT == MVT::Other) + // Until MVT is extended to handle this, simply check for the size and + // rely on the condition below: allow accesses if the size is a multiple of 4. + if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && + VT.getStoreSize() > 16)) { return false; + } if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { @@ -450,6 +556,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return AlignedBy4; } + // FIXME: We have to be conservative here and assume that flat operations + // will access scratch. If we had access to the IR function, then we + // could determine if any private memory was used in the function. + if (!Subtarget->hasUnalignedScratchAccess() && + (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { + return false; + } + if (Subtarget->hasUnalignedBufferAccess()) { // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. @@ -496,8 +611,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, static bool isFlatGlobalAddrSpace(unsigned AS) { return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -505,6 +620,23 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } +bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.noclobber"); +} + +bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + // Flat -> private/local is a simple truncate. 
+ // Flat -> global is no-op + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) + return true; + + return isNoopAddrSpaceCast(SrcAS, DestAS); +} + bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); @@ -531,11 +663,27 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - return TII->isInlineConstant(Imm); + // FIXME: Could be smarter if called for vector constants. + return true; } bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { + if (Subtarget->has16BitInsts() && VT == MVT::i16) { + switch (Op) { + case ISD::LOAD: + case ISD::STORE: + + // These operations are done with 32-bit instructions anyway. + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::SELECT: + // TODO: Extensions? + return true; + default: + return false; + } + } // SimplifySetCC uses this function to determine whether or not it should // create setcc with i1 operands. We don't have instructions for i1 setcc. @@ -560,26 +708,39 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } + SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed) const { + unsigned Offset, bool Signed, + const ISD::InputArg *Arg) const { const DataLayout &DL = DAG.getDataLayout(); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - SDValue PtrOffset = DAG.getUNDEF(PtrVT); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned Align = DL.getABITypeAlignment(Ty); - ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); + SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, + MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + + SDValue Val = Load; + if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && + VT.bitsLT(MemVT)) { + unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext : ISD::AssertSext; + Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); + } + if (MemVT.isFloatingPoint()) - ExtTy = ISD::EXTLOAD; + Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); + else if (Signed) + Val = DAG.getSExtOrTrunc(Val, SL, VT); + else + Val = DAG.getZExtOrTrunc(Val, SL, VT); - SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); - return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, - PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal | - MachineMemOperand::MOInvariant); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } SDValue SITargetLowering::LowerFormalArguments( @@ -679,12 +840,9 @@ SDValue SITargetLowering::LowerFormalArguments( } if (!AMDGPU::isShader(CallConv)) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); - assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && @@ -692,6 +850,12 @@ SDValue SITargetLowering::LowerFormalArguments( !Info->hasWorkItemIDZ()); } + if (Info->hasPrivateMemoryInputPtr()) { + unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI); + MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(PrivateMemoryPtrReg); + } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? if (Info->hasPrivateSegmentBuffer()) { unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); @@ -701,29 +865,38 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasDispatchPtr()) { unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); - MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } if (Info->hasQueuePtr()) { unsigned QueuePtrReg = Info->addQueuePtr(*TRI); - MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasDispatchID()) { + unsigned DispatchIDReg = Info->addDispatchID(*TRI); + MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(DispatchIDReg); + } + if (Info->hasFlatScratchInit()) { unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); - MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } - AnalyzeFormalArguments(CCInfo, Splits); + if (!AMDGPU::isShader(CallConv)) + analyzeFormalArgumentsCompute(CCInfo, Ins); + else + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -740,13 +913,14 @@ SDValue SITargetLowering::LowerFormalArguments( if (VA.isMemLoc()) { VT = Ins[i].VT; - EVT MemVT = Splits[i].VT; - const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + + EVT MemVT = VA.getLocVT(); + const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); // The first 36 bytes of the input buffer contains information about // thread group and global sizes. 
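Before the LowerParameter call that follows, a note on what that helper (added earlier in this hunk) produces: the argument is loaded with its in-memory type from the constant address space at ExplicitKernelArgOffset + LocMemOffset, optionally wrapped in AssertSext/AssertZext, and then extended or truncated to the type the function body expects. A simplified integer-only model of that last step; the names are illustrative, not LLVM APIs.

    #include <cstdint>

    // Model of the extension at the end of LowerParameter above, shown for
    // MemVT = i16 loaded from the kernarg segment and VT = i32 in registers.
    int32_t extendLoadedArg(int16_t Loaded, bool Signed) {
      return Signed
                 ? static_cast<int32_t>(Loaded)                           // sext
                 : static_cast<int32_t>(static_cast<uint16_t>(Loaded));   // zext
    }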
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt()); + Offset, Ins[i].Flags.isSExt(), + &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = @@ -761,7 +935,7 @@ SDValue SITargetLowering::LowerFormalArguments( } InVals.push_back(Arg); - Info->ABIArgOffset = Offset + MemVT.getStoreSize(); + Info->setABIArgOffset(Offset + MemVT.getStoreSize()); continue; } assert(VA.isRegLoc() && "Parameter must be in a register!"); @@ -771,8 +945,8 @@ SDValue SITargetLowering::LowerFormalArguments( if (VT == MVT::i64) { // For now assume it is a pointer Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SReg_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + &AMDGPU::SGPR_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass); SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Copy); continue; @@ -816,25 +990,25 @@ SDValue SITargetLowering::LowerFormalArguments( // Start adding system SGPRs. if (Info->hasWorkGroupIDX()) { unsigned Reg = Info->addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupIDZ()) { unsigned Reg = Info->addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupInfo()) { unsigned Reg = Info->addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } @@ -854,18 +1028,22 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. if (HasStackObjects) Info->setHasNonSpillStackObjects(true); - if (ST.isAmdHsaOS()) { - // TODO: Assume we will spill without optimizations. + // Everything live out of a block is spilled with fast regalloc, so it's + // almost certain that spilling will be required. + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + HasStackObjects = true; + + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects) { // If we have stack objects, we unquestionably need the private buffer - // resource. For the HSA ABI, this will be the first 4 user SGPR - // inputs. We can reserve those and use them directly. + // resource. For the Code Object V2 ABI, this will be the first 4 user + // SGPR inputs. We can reserve those and use them directly. unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); @@ -1088,64 +1266,551 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, MachineBasicBlock *SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); - // Fix the block phi references to point to the new block for the defs in the - // second piece of the block. 
- for (MachineBasicBlock *Succ : BB->successors()) { - for (MachineInstr &MI : *Succ) { - if (!MI.isPHI()) - break; - - for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) { - MachineOperand &FromBB = MI.getOperand(I); - if (BB == FromBB.getMBB()) { - FromBB.setMBB(SplitBB); - break; - } - } - } - } - MF->insert(++MachineFunction::iterator(BB), SplitBB); SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); - SplitBB->transferSuccessors(BB); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(SplitBB); MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); return SplitBB; } +// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the +// wavefront. If the value is uniform and just happens to be in a VGPR, this +// will only do one iteration. In the worst case, this will loop 64 times. +// +// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. +static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( + const SIInstrInfo *TII, + MachineRegisterInfo &MRI, + MachineBasicBlock &OrigBB, + MachineBasicBlock &LoopBB, + const DebugLoc &DL, + const MachineOperand &IdxReg, + unsigned InitReg, + unsigned ResultReg, + unsigned PhiReg, + unsigned InitSaveExecReg, + int Offset, + bool UseGPRIdxMode) { + MachineBasicBlock::iterator I = LoopBB.begin(); + + unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) + .addReg(InitReg) + .addMBB(&OrigBB) + .addReg(ResultReg) + .addMBB(&LoopBB); + + BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&OrigBB) + .addReg(NewExec) + .addMBB(&LoopBB); + + // Read the next variant <- also loop target. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Compare the just read M0 value to all possible Idx values. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) + .addReg(CurrentIdxReg) + .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + + if (UseGPRIdxMode) { + unsigned IdxReg; + if (Offset == 0) { + IdxReg = CurrentIdxReg; + } else { + IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); + } + + MachineInstr *SetIdx = + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) + .addReg(IdxReg, RegState::Kill); + SetIdx->getOperand(2).setIsUndef(); + } else { + // Move index from VCC into M0 + if (Offset == 0) { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(CurrentIdxReg, RegState::Kill); + } else { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); + } + } + + // Update EXEC, save the original EXEC value to VCC. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
+ MachineInstr *InsertPt = + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(NewExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&LoopBB); + + return InsertPt->getIterator(); +} + +// This has slightly sub-optimal regalloc when the source vector is killed by +// the read. The register allocator does not understand that the kill is +// per-workitem, so is kept alive for the whole loop so we end up not re-using a +// subregister from it, using 1 more VGPR than necessary. This was saved when +// this was expanded after register allocation. +static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, + MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned InitResultReg, + unsigned PhiReg, + int Offset, + bool UseGPRIdxMode) { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); + + // Save the EXEC mask + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) + .addReg(AMDGPU::EXEC); + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. + MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + + auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, + InitResultReg, DstReg, PhiReg, TmpExec, + Offset, UseGPRIdxMode); + + MachineBasicBlock::iterator First = RemainderBB->begin(); + BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(SaveExec); + + return InsPt; +} + +// Returns subreg index, offset +static std::pair<unsigned, int> +computeIndirectRegAndOffset(const SIRegisterInfo &TRI, + const TargetRegisterClass *SuperRC, + unsigned VecReg, + int Offset) { + int NumElts = SuperRC->getSize() / 4; + + // Skip out of bounds offsets, or else we would end up using an undefined + // register. + if (Offset >= NumElts || Offset < 0) + return std::make_pair(AMDGPU::sub0, Offset); + + return std::make_pair(AMDGPU::sub0 + Offset, 0); +} + +// Return true if the index is an SGPR and was set. 
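The two functions above implement a waterfall loop for indexing with a possibly divergent index: read the index of the first active lane, service every lane that shares that index, remove those lanes from EXEC, and repeat until EXEC is empty; setM0ToIndexFromSGPR, which follows, is the cheap path taken when the index is already uniform. A scalar simulation of the divergent path in plain C++ over a 64-lane wave; indices are assumed in range, and __builtin_ctzll stands in for v_readfirstlane picking the first set EXEC bit.

    #include <array>
    #include <cstdint>
    #include <vector>

    // Scalar simulation of the EXEC waterfall loop emitted above. Each trip
    // handles every lane whose index equals that of the first active lane.
    std::array<int, 64> waterfallGather(const std::array<int, 64> &LaneIdx,
                                        const std::vector<int> &Vec) {
      std::array<int, 64> Result{};
      uint64_t Exec = ~0ull;                        // all 64 lanes active
      while (Exec != 0) {
        int FirstLane = __builtin_ctzll(Exec);      // first still-active lane
        int CurrentIdx = LaneIdx[FirstLane];        // v_readfirstlane_b32
        uint64_t Cond = 0;                          // v_cmp_eq_u32 result
        for (int L = 0; L < 64; ++L)
          if (((Exec >> L) & 1) && LaneIdx[L] == CurrentIdx)
            Cond |= 1ull << L;
        for (int L = 0; L < 64; ++L)                // body under narrowed EXEC
          if ((Cond >> L) & 1)
            Result[L] = Vec[CurrentIdx];            // the movrel/indexed access
        Exec &= ~Cond;        // net effect of s_and_saveexec_b64 + s_xor_b64
      }
      return Result;
    }

In the worst case, with all 64 indices distinct, the loop runs 64 times, which is exactly the behaviour the comment above calls out; a uniform index makes it terminate after one trip.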
+static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, + MachineRegisterInfo &MRI, + MachineInstr &MI, + int Offset, + bool UseGPRIdxMode, + bool IsIndirectSrc) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); + + assert(Idx->getReg() != AMDGPU::NoRegister); + + if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) + return false; + + if (UseGPRIdxMode) { + unsigned IdxMode = IsIndirectSrc ? + VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + if (Offset == 0) { + MachineInstr *SetOn = + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addOperand(*Idx) + .addImm(IdxMode); + + SetOn->getOperand(3).setIsUndef(); + } else { + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) + .addOperand(*Idx) + .addImm(Offset); + MachineInstr *SetOn = + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(Tmp, RegState::Kill) + .addImm(IdxMode); + + SetOn->getOperand(3).setIsUndef(); + } + + return true; + } + + if (Offset == 0) { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(*Idx); + } else { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addOperand(*Idx) + .addImm(Offset); + } + + return true; +} + +// Control flow needs to be inserted if indexing with a VGPR. +static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, + MachineBasicBlock &MBB, + const SISubtarget &ST) { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); + + const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); + + unsigned SubReg; + std::tie(SubReg, Offset) + = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); + + bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + if (UseGPRIdxMode) { + // TODO: Look at the uses to avoid the copy. This may require rescheduling + // to avoid interfering with other uses, so probably requires a new + // optimization pass. + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit); + } + + MI.eraseFromParent(); + + return &MBB; + } + + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); + + unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); + + if (UseGPRIdxMode) { + MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addImm(0) // Reset inside loop. 
+ .addImm(VGPRIndexMode::SRC0_ENABLE); + SetOn->getOperand(3).setIsUndef(); + + // Disable again after the loop. + BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + } + + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode); + MachineBasicBlock *LoopBB = InsPt->getParent(); + + if (UseGPRIdxMode) { + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); + } else { + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit); + } + + MI.eraseFromParent(); + + return LoopBB; +} + +static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { + switch (VecRC->getSize()) { + case 4: + return AMDGPU::V_MOVRELD_B32_V1; + case 8: + return AMDGPU::V_MOVRELD_B32_V2; + case 16: + return AMDGPU::V_MOVRELD_B32_V4; + case 32: + return AMDGPU::V_MOVRELD_B32_V8; + case 64: + return AMDGPU::V_MOVRELD_B32_V16; + default: + llvm_unreachable("unsupported size for MOVRELD pseudos"); + } +} + +static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, + MachineBasicBlock &MBB, + const SISubtarget &ST) { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned Dst = MI.getOperand(0).getReg(); + const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); + const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); + + // This can be an immediate, but will be folded later. 
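emitIndirectDst resumes just below; before it, a compact restatement of computeIndirectRegAndOffset from a little earlier in this hunk: a constant offset known to be in range is folded into the subregister index, so no M0 arithmetic is needed, while an out-of-range or negative offset keeps the dynamic offset and falls back to sub0, matching the undefined result of an out-of-bounds extract/insert. An illustrative sketch, with sub0 modelled as plain index 0.

    #include <utility>

    // Model of computeIndirectRegAndOffset above; returns {subreg index, offset}.
    std::pair<unsigned, int> modelIndirectRegAndOffset(unsigned VecSizeBytes,
                                                       int Offset) {
      const unsigned Sub0 = 0;              // stands in for AMDGPU::sub0
      int NumElts = VecSizeBytes / 4;       // 32-bit elements per super-register
      if (Offset >= NumElts || Offset < 0)
        return {Sub0, Offset};              // keep the dynamic offset
      return {Sub0 + Offset, 0};            // fold the constant into the subreg
    }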
+ assert(Val->getReg()); + + unsigned SubReg; + std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, + SrcVec->getReg(), + Offset); + bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + + if (Idx->getReg() == AMDGPU::NoRegister) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + assert(Offset == 0); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) + .addOperand(*SrcVec) + .addOperand(*Val) + .addImm(SubReg); + + MI.eraseFromParent(); + return &MBB; + } + + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + if (UseGPRIdxMode) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst + .addOperand(*Val) + .addReg(Dst, RegState::ImplicitDefine) + .addReg(SrcVec->getReg(), RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + } else { + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + + BuildMI(MBB, I, DL, MovRelDesc) + .addReg(Dst, RegState::Define) + .addReg(SrcVec->getReg()) + .addOperand(*Val) + .addImm(SubReg - AMDGPU::sub0); + } + + MI.eraseFromParent(); + return &MBB; + } + + if (Val->isReg()) + MRI.clearKillFlags(Val->getReg()); + + const DebugLoc &DL = MI.getDebugLoc(); + + if (UseGPRIdxMode) { + MachineBasicBlock::iterator I(&MI); + + MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addImm(0) // Reset inside loop. + .addImm(VGPRIndexMode::DST_ENABLE); + SetOn->getOperand(3).setIsUndef(); + + // Disable again after the loop. + BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + } + + unsigned PhiReg = MRI.createVirtualRegister(VecRC); + + auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, + Offset, UseGPRIdxMode); + MachineBasicBlock *LoopBB = InsPt->getParent(); + + if (UseGPRIdxMode) { + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) + .addReg(PhiReg, RegState::Undef, SubReg) // vdst + .addOperand(*Val) // src0 + .addReg(Dst, RegState::ImplicitDefine) + .addReg(PhiReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); + } else { + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + + BuildMI(*LoopBB, InsPt, DL, MovRelDesc) + .addReg(Dst, RegState::Define) + .addReg(PhiReg) + .addOperand(*Val) + .addImm(SubReg - AMDGPU::sub0); + } + + MI.eraseFromParent(); + + return LoopBB; +} + MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { + + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + if (TII->isMIMG(MI)) { + if (!MI.memoperands_empty()) + return BB; + // Add a memoperand for mimg instructions so that they aren't assumed to + // be ordered memory instuctions. 
+ + MachinePointerInfo PtrInfo(MFI->getImagePSV()); + MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable; + if (MI.mayStore()) + Flags |= MachineMemOperand::MOStore; + + if (MI.mayLoad()) + Flags |= MachineMemOperand::MOLoad; + + auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); + MI.addMemOperand(*MF, MMO); + return BB; + } + switch (MI.getOpcode()) { case AMDGPU::SI_INIT_M0: { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(MI.getOperand(0)); + .addOperand(MI.getOperand(0)); MI.eraseFromParent(); - break; - } - case AMDGPU::BRANCH: return BB; + } case AMDGPU::GET_GROUPSTATICSIZE: { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - - MachineFunction *MF = BB->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) .addOperand(MI.getOperand(0)) - .addImm(MFI->LDSSize); + .addImm(MFI->getLDSSize()); MI.eraseFromParent(); return BB; } + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: + return emitIndirectSrc(MI, *BB, *getSubtarget()); + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + return emitIndirectDst(MI, *BB, *getSubtarget()); case AMDGPU::SI_KILL: return splitKillBlock(MI, BB); + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src0 = MI.getOperand(1).getReg(); + unsigned Src1 = MI.getOperand(2).getReg(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned SrcCond = MI.getOperand(3).getReg(); + + unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(Src0, 0, AMDGPU::sub0) + .addReg(Src1, 0, AMDGPU::sub0) + .addReg(SrcCond); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(Src0, 0, AMDGPU::sub1) + .addReg(Src1, 0, AMDGPU::sub1) + .addReg(SrcCond); + + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_BR_UNDEF: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) + .addOperand(MI.getOperand(0)); + Br->getOperand(1).setIsUndef(true); // read undef SCC + MI.eraseFromParent(); + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } - return BB; } bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { @@ -1167,8 +1832,10 @@ EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); } -MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { - return MVT::i32; +MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { + // TODO: Should i16 be used always if legal? For now it would force VALU + // shifts. + return (VT == MVT::i16) ? 
MVT::i16 : MVT::i32; } // Answering this is somewhat tricky and depends on the specific device which @@ -1201,6 +1868,8 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); case MVT::f64: return true; + case MVT::f16: + return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals(); default: break; } @@ -1215,7 +1884,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); @@ -1242,6 +1910,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); case ISD::TRAP: return lowerTRAP(Op, DAG); + case ISD::FP_ROUND: + return lowerFP_ROUND(Op, DAG); } return SDValue(); } @@ -1262,58 +1932,31 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { return nullptr; } -SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - - SDLoc SL(Op); - FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); - unsigned FrameIndex = FINode->getIndex(); - - // A FrameIndex node represents a 32-bit offset into scratch memory. If the - // high bit of a frame index offset were to be set, this would mean that it - // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch - // buffer, with 64 being the number of threads per wave. - // - // The maximum private allocation for the entire GPU is 4G, and we are - // concerned with the largest the index could ever be for an individual - // workitem. This will occur with the minmum dispatch size. If a program - // requires more, the dispatch size will be reduced. - // - // With this limit, we can mark the high bit of the FrameIndex node as known - // zero, which is important, because it means in most situations we can prove - // that values derived from FrameIndex nodes are non-negative. This enables us - // to take advantage of more addressing modes when accessing scratch buffers, - // since for scratch reads/writes, the register offset must always be - // positive. - - uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; - - // XXX - It is unclear if partial dispatch works. Assume it works at half wave - // granularity. It is probably a full wave. 
- uint64_t MinGranularity = 32; - - unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); - EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); - - SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(ExtVT)); -} - bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { - if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) - return false; + if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { + case AMDGPUIntrinsic::amdgcn_if: + case AMDGPUIntrinsic::amdgcn_else: + case AMDGPUIntrinsic::amdgcn_end_cf: + case AMDGPUIntrinsic::amdgcn_loop: + return true; + default: + return false; + } + } - switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { - default: return false; - case AMDGPUIntrinsic::amdgcn_if: - case AMDGPUIntrinsic::amdgcn_else: - case AMDGPUIntrinsic::amdgcn_break: - case AMDGPUIntrinsic::amdgcn_if_break: - case AMDGPUIntrinsic::amdgcn_else_break: - case AMDGPUIntrinsic::amdgcn_loop: - case AMDGPUIntrinsic::amdgcn_end_cf: - return true; + if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::amdgcn_break: + case AMDGPUIntrinsic::amdgcn_if_break: + case AMDGPUIntrinsic::amdgcn_else_break: + return true; + default: + return false; + } } + + return false; } void SITargetLowering::createDebuggerPrologueStackObjects( @@ -1334,14 +1977,31 @@ void SITargetLowering::createDebuggerPrologueStackObjects( // For each dimension: for (unsigned i = 0; i < 3; ++i) { // Create fixed stack object for work group ID. - ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true); + ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); // Create fixed stack object for work item ID. - ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true); + ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); } } +bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { + const Triple &TT = getTargetMachine().getTargetTriple(); + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + AMDGPU::shouldEmitConstantsToTextSection(TT); +} + +bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { + return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + !shouldEmitFixup(GV) && + !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); +} + +bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { + return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -1365,30 +2025,50 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Target = BR->getOperand(1); } + // FIXME: This changes the types of the intrinsics instead of introducing new + // nodes with the correct types. + // e.g. 
llvm.amdgcn.loop + + // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 + // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> + if (!isCFIntrinsic(Intr)) { // This is a uniform branch so we don't need to legalize. return BRCOND; } + bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || + Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; + assert(!SetCC || (SetCC->getConstantOperandVal(1) == 1 && cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == ISD::SETNE)); - // Build the result and - ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); - // operands of the new intrinsic call SmallVector<SDValue, 4> Ops; - Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + 1, Intr->op_end()); + if (HaveChain) + Ops.push_back(BRCOND.getOperand(0)); + + Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end()); Ops.push_back(Target); + ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); + // build the new intrinsic call SDNode *Result = DAG.getNode( Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, DAG.getVTList(Res), Ops).getNode(); + if (!HaveChain) { + SDValue Ops[] = { + SDValue(Result, 0), + BRCOND.getOperand(0) + }; + + Result = DAG.getMergeValues(Ops, DL).getNode(); + } + if (BR) { // Give the branch instruction our target SDValue Ops[] = { @@ -1425,6 +2105,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const { + return Op.getValueType().bitsLE(VT) ? + DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : + DAG.getNode(ISD::FTRUNC, DL, VT, Op); +} + +SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f16 && + "Do not know how to custom lower FP_ROUND for non-f16 type"); + + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT != MVT::f64) + return Op; + + SDLoc DL(Op); + + SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); + return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, SelectionDAG &DAG) const { SDLoc SL; @@ -1452,7 +2157,8 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, MachinePointerInfo PtrInfo(V, StructOffset); return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), - MachineMemOperand::MOInvariant); + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, @@ -1505,17 +2211,12 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, return DAG.getUNDEF(ASC->getValueType(0)); } -static bool shouldEmitGOTReloc(const GlobalValue *GV, - const TargetMachine &TM) { - return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); -} - bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. 
- return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); + return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + !shouldEmitGOTReloc(GA->getGlobal()); } static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, @@ -1523,14 +2224,27 @@ static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, unsigned GAFlags = SIInstrInfo::MO_NONE) { // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: - // s_getpc_b64 s[0:1] - // s_add_u32 s0, s0, $symbol - // s_addc_u32 s1, s1, 0 // - // s_getpc_b64 returns the address of the s_add_u32 instruction and then - // a fixup or relocation is emitted to replace $symbol with a literal - // constant, which is a pc-relative offset from the encoding of the $symbol - // operand to the global variable. + // For constant address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // For global address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo + // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // fixups or relocations are emitted to replace $symbol@*@lo and + // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, + // which is a 64-bit pc-relative offset from the encoding of the $symbol + // operand to the global variable. // // What we want here is an offset from the value returned by s_getpc // (which is the address of the s_add_u32 instruction) to the global @@ -1538,9 +2252,12 @@ static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags); - return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA); + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags); + SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags == SIInstrInfo::MO_NONE ? 
+ GAFlags : GAFlags + 1); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, @@ -1556,11 +2273,14 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); EVT PtrVT = Op.getValueType(); - if (!shouldEmitGOTReloc(GV, getTargetMachine())) + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); + else if (shouldEmitPCReloc(GV)) + return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, + SIInstrInfo::MO_REL32); SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, - SIInstrInfo::MO_GOTPCREL); + SIInstrInfo::MO_GOTPCREL32); Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); @@ -1570,7 +2290,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, - MachineMemOperand::MOInvariant); + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::lowerTRAP(SDValue Op, @@ -1647,9 +2368,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // TODO: Should this propagate fast-math-flags? switch (IntrinsicID) { + case Intrinsic::amdgcn_implicit_buffer_ptr: { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdHsaOS()) { + if (!Subtarget->isAmdCodeObjectV2(MF)) { DiagnosticInfoUnsupported BadIntrin( *MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); @@ -1671,6 +2396,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } + case Intrinsic::amdgcn_dispatch_id: { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq: @@ -1682,6 +2411,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } + case Intrinsic::amdgcn_rcp_legacy: { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); + } case Intrinsic::amdgcn_rsq_clamp: { if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); @@ -1750,22 +2484,17 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::amdgcn_read_workdim: - case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. - // Really only 2 bits. 
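// A rough sketch of the three-way choice LowerGlobalAddress now makes with
// the new shouldEmitFixup / shouldEmitGOTReloc / shouldEmitPCReloc helpers.
// The enum and the boolean parameters are illustrative stand-ins for the real
// queries (address-space checks, shouldEmitConstantsToTextSection,
// shouldAssumeDSOLocal), not part of the patch.
enum class GVAccess { Fixup, PCRel32, GOTPCRel32 };

GVAccess classifyGlobal(bool InConstantAS, bool InGlobalAS,
                        bool ConstantsGoToText, bool DSOLocal) {
  if (InConstantAS && ConstantsGoToText)
    return GVAccess::Fixup;        // shouldEmitFixup: constants emitted into .text
  if ((InGlobalAS || InConstantAS) && !DSOLocal)
    return GVAccess::GOTPCRel32;   // shouldEmitGOTReloc: load the address from the GOT
  return GVAccess::PCRel32;        // shouldEmitPCReloc: direct pc-relative relocation
}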
- return lowerImplicitZextParam(DAG, Op, MVT::i8, - getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: @@ -1786,9 +2515,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, }; MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } @@ -1818,6 +2548,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(0, DL, MVT::i32)); SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, DAG.getConstant(1, DL, MVT::i32)); + I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I); + J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J); SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); SDValue Glue = M0.getValue(1); SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, @@ -1827,6 +2559,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_mov: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } case Intrinsic::amdgcn_interp_p1: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); SDValue Glue = M0.getValue(1); @@ -1899,6 +2637,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, Denominator, Numerator); } + case Intrinsic::amdgcn_icmp: { + const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + int CondCode = CD->getSExtValue(); + + if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || + CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) + return DAG.getUNDEF(VT); + + ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); + ISD::CondCode CCOpcode = getICmpCondCode(IcInput); + return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), + Op.getOperand(2), DAG.getCondCode(CCOpcode)); + } + case Intrinsic::amdgcn_fcmp: { + const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + int CondCode = CD->getSExtValue(); + + if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || + CondCode >= FCmpInst::Predicate::FCMP_TRUE) + return DAG.getUNDEF(VT); + + FCmpInst::Predicate IcInput = 
static_cast<FCmpInst::Predicate>(CondCode); + ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); + return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), + Op.getOperand(2), DAG.getCondCode(CCOpcode)); + } + case Intrinsic::amdgcn_fmul_legacy: + return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_sffbh: + case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. + return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1907,6 +2677,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + SDLoc DL(Op); switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -1922,6 +2693,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // offset + Op.getOperand(5), // glc + Op.getOperand(6) // slc + }; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? + AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(MFI->getBufferPSV()), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); + } default: return SDValue(); } @@ -1935,12 +2731,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: { + case AMDGPUIntrinsic::SI_sendmsg: + case Intrinsic::amdgcn_s_sendmsg: { Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); SDValue Glue = Chain.getValue(1); return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_s_sendmsghalt: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Chain, @@ -1969,12 +2772,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op->getVTList(), Ops, VT, MMO); } case AMDGPUIntrinsic::AMDGPU_kill: { - if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) { + SDValue Src = Op.getOperand(2); + if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { if (!K->isNegative()) return Chain; + + SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); + return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); } - return Op; + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); + return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); + } + case AMDGPUIntrinsic::SI_export: { + const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2)); + const ConstantSDNode *VM 
= cast<ConstantSDNode>(Op.getOperand(3)); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4)); + const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5)); + const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6)); + + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), + DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), + Op.getOperand(7), // src0 + Op.getOperand(8), // src1 + Op.getOperand(9), // src2 + Op.getOperand(10) // src3 + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } default: return SDValue(); @@ -1988,7 +2819,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = Load->getMemoryVT(); if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { - assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC // First, load into 32 bits, then truncate to 1 bit. @@ -1996,8 +2826,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BasePtr = Load->getBasePtr(); MachineMemOperand *MMO = Load->getMemOperand(); + EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, MVT::i8, MMO); + BasePtr, RealMemVT, MMO); SDValue Ops[] = { DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), @@ -2021,17 +2853,34 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + // If there is a possibilty that flat instruction access scratch memory + // then we need to use the same legalization rules we use for private. + if (AS == AMDGPUAS::FLAT_ADDRESS) + AS = MFI->hasFlatScratchInit() ? + AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + unsigned NumElements = MemVT.getVectorNumElements(); switch (AS) { case AMDGPUAS::CONSTANT_ADDRESS: if (isMemOpUniform(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they - // have the same legalization requires ments as global and private + // have the same legalization requirements as global and private // loads. // - // Fall-through - case AMDGPUAS::GLOBAL_ADDRESS: + LLVM_FALLTHROUGH; + case AMDGPUAS::GLOBAL_ADDRESS: { + if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && + isMemOpHasNoClobberedMemOperand(Load)) + return SDValue(); + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requirements as global and private + // loads. + // + } + LLVM_FALLTHROUGH; case AMDGPUAS::FLAT_ADDRESS: if (NumElements > 4) return SplitVectorLoad(Op, DAG); @@ -2110,22 +2959,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && - CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. 
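// A minimal sketch of the new flat-address rule shared by LowerLOAD and
// LowerSTORE: a flat access may touch scratch, so when the kernel initializes
// flat scratch the private legalization rules apply, otherwise the global
// ones do. The enumerators below are illustrative, not the AMDGPUAS values.
enum AddrSpace { Global, Constant, Private, Flat, Local };

AddrSpace legalizeFlatAS(AddrSpace AS, bool HasFlatScratchInit) {
  if (AS == Flat)
    return HasFlatScratchInit ? Private : Global;
  return AS;
}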
- - // 1.0 / sqrt(x) -> rsq(x) - // - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + VT == MVT::f16) { + if (CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. + + // 1.0 / sqrt(x) -> rsq(x) + + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + + // Same as for 1.0, but expand the sign out of the constant. + if (CLHS->isExactlyValue(-1.0)) { + // -1.0 / x -> rcp (fneg x) + SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); + } } } @@ -2143,6 +3003,67 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, return SDValue(); } +static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, + EVT VT, SDValue A, SDValue B, SDValue GlueChain) { + if (GlueChain->getNumValues() <= 1) { + return DAG.getNode(Opcode, SL, VT, A, B); + } + + assert(GlueChain->getNumValues() == 3); + + SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); + switch (Opcode) { + default: llvm_unreachable("no chain equivalent for opcode"); + case ISD::FMUL: + Opcode = AMDGPUISD::FMUL_W_CHAIN; + break; + } + + return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, + GlueChain.getValue(2)); +} + +static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, + EVT VT, SDValue A, SDValue B, SDValue C, + SDValue GlueChain) { + if (GlueChain->getNumValues() <= 1) { + return DAG.getNode(Opcode, SL, VT, A, B, C); + } + + assert(GlueChain->getNumValues() == 3); + + SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); + switch (Opcode) { + default: llvm_unreachable("no chain equivalent for opcode"); + case ISD::FMA: + Opcode = AMDGPUISD::FMA_W_CHAIN; + break; + } + + return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, + GlueChain.getValue(2)); +} + +SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue Src0 = Op.getOperand(0); + SDValue Src1 = Op.getOperand(1); + + SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); + SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); + + SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); + SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); + + SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); + SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); +} + // Faster 2.5 ULP division that does not support denormals. 
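// A host-side model of the LowerFDIV16 sequence above: extend both operands
// to f32, multiply the numerator by an approximate reciprocal (V_RCP_F32 on
// the GPU, modeled here as 1.0f/x), then round back to f16. The trailing
// DIV_FIXUP, which repairs infinities, NaNs and signed zeros, is not modeled.
float fdiv16_model(float A16, float B16) {   // A16/B16 hold f16 values promoted to f32
  float Rcp  = 1.0f / B16;                   // ~1 ulp reciprocal approximation
  float Quot = A16 * Rcp;                    // FMUL in f32
  return Quot;                               // the real lowering emits FP_ROUND to f16 here
}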
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -2189,25 +3110,73 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); - SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + LHS, RHS, LHS); // Denominator is scaled to not be denormal, so using rcp is ok. - SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, + DenominatorScaled); + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, + DenominatorScaled); + + const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | + (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + + const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); + + if (!Subtarget->hasFP32Denormals()) { + SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, + SL, MVT::i32); + SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), + EnableDenormValue, BitField); + SDValue Ops[3] = { + NegDivScale0, + EnableDenorm.getValue(0), + EnableDenorm.getValue(1) + }; + + NegDivScale0 = DAG.getMergeValues(Ops, SL); + } - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); + SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, + ApproxRcp, One, NegDivScale0); - SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); - SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); + SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, + ApproxRcp, Fma0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); + SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, + Fma1, Fma1); - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); - SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); + SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, + NumeratorScaled, Mul); + + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + + SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, + NumeratorScaled, Fma3); + + if (!Subtarget->hasFP32Denormals()) { + const SDValue DisableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), + DisableDenormValue, + BitField, + Fma4.getValue(2)); + + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + DisableDenorm, DAG.getRoot()); + DAG.setRoot(OutputChain); + } SDValue Scale = NumeratorScaled.getValue(1); - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, + Fma4, Fma1, Fma3, Scale); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } @@ -2288,6 +3257,9 @@ SDValue 
SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f64) return LowerFDIV64(Op, DAG); + if (VT == MVT::f16) + return LowerFDIV16(Op, DAG); + llvm_unreachable("Unexpected type for fdiv"); } @@ -2311,6 +3283,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return expandUnalignedStore(Store, DAG); } + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + // If there is a possibilty that flat instruction access scratch memory + // then we need to use the same legalization rules we use for private. + if (AS == AMDGPUAS::FLAT_ADDRESS) + AS = MFI->hasFlatScratchInit() ? + AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + unsigned NumElements = VT.getVectorNumElements(); switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: @@ -2504,23 +3484,83 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); } +SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Ptr = N->getBasePtr(); + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + // TODO: We could also do this for multiplies. + unsigned AS = N->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); + + NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + } + } + + return SDValue(); +} + +static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { + return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || + (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || + (Opc == ISD::XOR && Val == 0); +} + +// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This +// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit +// integer combine opportunities since most 64-bit operations are decomposed +// this way. TODO: We won't want this for SALU especially if it is an inline +// immediate. +SDValue SITargetLowering::splitBinaryBitConstantOp( + DAGCombinerInfo &DCI, + const SDLoc &SL, + unsigned Opc, SDValue LHS, + const ConstantSDNode *CRHS) const { + uint64_t Val = CRHS->getZExtValue(); + uint32_t ValLo = Lo_32(Val); + uint32_t ValHi = Hi_32(Val); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + if ((bitOpWithConstantIsReducible(Opc, ValLo) || + bitOpWithConstantIsReducible(Opc, ValHi)) || + (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + // If we need to materialize a 64-bit immediate, it will be split up later + // anyway. Avoid creating the harder to understand 64-bit immediate + // materialization. 
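// Why splitBinaryBitConstantOp pays off, shown on plain 32/64-bit integers:
// after the split, one half of the constant is often an identity or an
// annihilator, mirroring bitOpWithConstantIsReducible() above. This is an
// illustration only, not the SelectionDAG code.
#include <cstdint>

bool reducibleHalf(char Opc, uint32_t Val) {
  switch (Opc) {
  case '&': return Val == 0 || Val == 0xffffffffu; // x & 0 -> 0, x & ~0 -> x
  case '|': return Val == 0xffffffffu || Val == 0; // x | ~0 -> ~0, x | 0 -> x
  case '^': return Val == 0;                       // x ^ 0 -> x
  default:  return false;
  }
}

uint64_t andSplit(uint64_t X, uint64_t C) {
  uint32_t Lo = uint32_t(X) & uint32_t(C);             // Lo_32 halves
  uint32_t Hi = uint32_t(X >> 32) & uint32_t(C >> 32); // Hi_32 halves
  return (uint64_t(Hi) << 32) | Lo;                    // rebuilt as v2i32 -> i64
}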
+ return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); + } + + return SDValue(); +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) return SDValue(); - if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) - return Base; - SelectionDAG &DAG = DCI.DAG; - - // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> - // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::SETCC && - RHS.getOpcode() == ISD::SETCC) { + + if (VT == MVT::i64) { + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + return Split; + } + } + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); @@ -2568,54 +3608,85 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); - if (VT == MVT::i64) { - // TODO: This could be a generic combine with a predicate for extracting the - // high half of an integer being free. - - // (or i64:x, (zero_extend i32:y)) -> - // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) - if (LHS.getOpcode() == ISD::ZERO_EXTEND && - RHS.getOpcode() != ISD::ZERO_EXTEND) - std::swap(LHS, RHS); - - if (RHS.getOpcode() == ISD::ZERO_EXTEND) { - SDValue ExtSrc = RHS.getOperand(0); - EVT SrcVT = ExtSrc.getValueType(); - if (SrcVT == MVT::i32) { - SDLoc SL(N); - SDValue LowLHS, HiBits; - std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); - SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); - - DCI.AddToWorklist(LowOr.getNode()); - DCI.AddToWorklist(HiBits.getNode()); - - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - LowOr, HiBits); - return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); - } + if (VT == MVT::i1) { + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. 
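// The i1 OR combine above reduced to its arithmetic: two v_cmp_class tests of
// the same operand merge by OR-ing their class masks, of which only the low
// ten bits (the NaN/infinity/normal/denormal/zero tests) are meaningful.
#include <cstdint>

uint32_t mergeClassMasks(uint32_t MaskA, uint32_t MaskB) {
  const uint32_t MaxMask = 0x3ff;            // only 10 bits are used
  return (MaskA | MaskB) & MaxMask;
}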
+ static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + Src, DAG.getConstant(NewMask, DL, MVT::i32)); } + + return SDValue(); } - // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) - if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && - RHS.getOpcode() == AMDGPUISD::FP_CLASS) { - SDValue Src = LHS.getOperand(0); - if (Src != RHS.getOperand(0)) - return SDValue(); + if (VT != MVT::i64) + return SDValue(); - const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); - const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); - if (!CLHS || !CRHS) - return SDValue(); + // TODO: This could be a generic combine with a predicate for extracting the + // high half of an integer being free. + + // (or i64:x, (zero_extend i32:y)) -> + // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) + if (LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LHS, RHS); + + if (RHS.getOpcode() == ISD::ZERO_EXTEND) { + SDValue ExtSrc = RHS.getOperand(0); + EVT SrcVT = ExtSrc.getValueType(); + if (SrcVT == MVT::i32) { + SDLoc SL(N); + SDValue LowLHS, HiBits; + std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); + SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); + + DCI.AddToWorklist(LowOr.getNode()); + DCI.AddToWorklist(HiBits.getNode()); + + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + LowOr, HiBits); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + } - // Only 10 bits are used. - static const uint32_t MaxMask = 0x3ff; + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) + return Split; + } + + return SDValue(); +} + +SDValue SITargetLowering::performXorCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + if (VT != MVT::i64) + return SDValue(); - uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - Src, DAG.getConstant(NewMask, DL, MVT::i32)); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) + return Split; } return SDValue(); @@ -2657,6 +3728,9 @@ SDValue SITargetLowering::performFCanonicalizeCombine( if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); + + if (VT == MVT::f16 && !Subtarget->hasFP16Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); } if (C.isNaN()) { @@ -2716,8 +3790,23 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + + MVT NVT = MVT::i32; + unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + SDValue Tmp1, Tmp2, Tmp3; + Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + + if (VT == MVT::i16) { + Tmp1 = DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, + Tmp1, Tmp2, Tmp3); + + return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); + } else + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { @@ -2814,6 +3903,119 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return SDValue(); } +unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, + const SDNode *N0, + const SDNode *N1) const { + EVT VT = N0->getValueType(0); + + // Only do this if we are not trying to support denormals. v_mad_f32 does not + // support denormals ever. + if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + return ISD::FMAD; + + const TargetOptions &Options = DAG.getTarget().Options; + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || + (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() && + cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) && + isFMAFasterThanFMulAndFAdd(VT)) { + return ISD::FMA; + } + + return 0; +} + +SDValue SITargetLowering::performFAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + assert(!VT.isVector()); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modiifiers is a pain. + + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); + if (FusedOp != 0) { + const SDValue Two = DAG.getConstantFP(2.0, SL, VT); + return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); + } + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); + if (FusedOp != 0) { + const SDValue Two = DAG.getConstantFP(2.0, SL, VT); + return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performFSubCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + assert(!VT.isVector()); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. 
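// The FADD combine above as a host-side identity: (a + a) + b equals the
// single fused operation 2.0*a + b (up to overflow), so it can become one
// v_mad/v_fma; getFusedOpcode picks FMAD when denormals are off and FMA when
// fast-math plus a fast FMA unit make it profitable. Sketch only.
#include <cmath>

float addTwiceThenAdd(float A, float B) {
  return std::fma(2.0f, A, B);   // == (A + A) + B, with a single rounding
}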
+ SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); + if (FusedOp != 0){ + const SDValue Two = DAG.getConstantFP(2.0, SL, VT); + SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); + } + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); + if (FusedOp != 0){ + const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); + return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); + } + } + } + + return SDValue(); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2823,7 +4025,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); - if (VT != MVT::f32 && VT != MVT::f64) + if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && + VT != MVT::f16)) return SDValue(); // Match isinf pattern @@ -2845,14 +4048,59 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, return SDValue(); } -SDValue SITargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); + SDLoc SL(N); + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + SDValue Srl = N->getOperand(0); + if (Srl.getOpcode() == ISD::ZERO_EXTEND) + Srl = Srl.getOperand(0); + + // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 
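// What CVT_F32_UBYTEn computes, and why a right shift on its source folds
// into the byte index in the combine above: ubyteN(x >> 8*k) selects the same
// byte as ubyte(N+k)(x) whenever N + k stays below 4. Illustrative helper:
#include <cstdint>

float cvtF32UByte(uint32_t X, unsigned N) {  // N in 0..3
  return float((X >> (8 * N)) & 0xffu);
}
// e.g. cvtF32UByte(X >> 16, 0) == cvtF32UByte(X, 2)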
+ if (Srl.getOpcode() == ISD::SRL) { + // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x + // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + + if (const ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { + Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), + EVT(MVT::i32)); + + unsigned SrcOffset = C->getZExtValue() + 8 * Offset; + if (SrcOffset < 32 && SrcOffset % 8 == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL, + MVT::f32, Srl); + } + } + } + + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + return SDValue(); +} + +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::FADD: + return performFAddCombine(N, DCI); + case ISD::FSUB: + return performFSubCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); case ISD::FMAXNUM: @@ -2869,127 +4117,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performMinMaxCombine(N, DCI); break; } - - case AMDGPUISD::CVT_F32_UBYTE0: - case AMDGPUISD::CVT_F32_UBYTE1: - case AMDGPUISD::CVT_F32_UBYTE2: - case AMDGPUISD::CVT_F32_UBYTE3: { - unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - SDValue Src = N->getOperand(0); - - // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. - if (Src.getOpcode() == ISD::SRL) { - // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x - // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x - // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x - - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { - unsigned SrcOffset = C->getZExtValue() + 8 * Offset; - if (SrcOffset < 32 && SrcOffset % 8 == 0) { - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, - MVT::f32, Src.getOperand(0)); - } - } - } - - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - - break; - } - - case ISD::UINT_TO_FP: { - return performUCharToFloatCombine(N, DCI); - } - case ISD::FADD: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - if (VT != MVT::f32) - break; - - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (Subtarget->hasFP32Denormals()) - break; - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. 
- - // fadd (fadd (a, a), b) -> mad 2.0, a, b - if (LHS.getOpcode() == ISD::FADD) { - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); - } - } - - // fadd (b, fadd (a, a)) -> mad 2.0, a, b - if (RHS.getOpcode() == ISD::FADD) { - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); - } - } - - return SDValue(); - } - case ISD::FSUB: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - - // Try to get the fneg to fold into the source modifier. This undoes generic - // DAG combines and folds them into the mad. - // - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (VT == MVT::f32 && - !Subtarget->hasFP32Denormals()) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::FADD) { - // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) - - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); - } - } - - if (RHS.getOpcode() == ISD::FADD) { - // (fsub c, (fadd a, a)) -> mad -2.0, a, c - - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); - } - } - - return SDValue(); - } - - break; - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: @@ -3011,27 +4138,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. if (DCI.isBeforeLegalize()) break; - - MemSDNode *MemNode = cast<MemSDNode>(N); - SDValue Ptr = MemNode->getBasePtr(); - - // TODO: We could also do this for multiplies. - unsigned AS = MemNode->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); - if (NewPtr) { - SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); - - NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; - return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); - } - } - break; + return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); } case ISD::AND: return performAndCombine(N, DCI); case ISD::OR: return performOrCombine(N, DCI); + case ISD::XOR: + return performXorCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); case ISD::FCANONICALIZE: @@ -3039,6 +4153,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::FRACT: case AMDGPUISD::RCP: case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { @@ -3047,38 +4162,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return Src; break; } + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performUCharToFloatCombine(N, DCI); + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: + return performCvtF32UByteNCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Analyze the possible immediate value Op -/// -/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate -/// and the immediate value if it's a literal immediate -int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - - if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { - if (TII->isInlineConstant(Node->getAPIntValue())) - return 0; - - uint64_t Val = Node->getZExtValue(); - return isUInt<32>(Val) ? Val : -1; - } - - if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { - if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) - return 0; - - if (Node->getValueType(0) == MVT::f32) - return FloatToBits(Node->getValueAPF().convertToFloat()); - - return -1; - } - - return -1; -} - /// \brief Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { @@ -3235,13 +4330,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (TII->isMIMG(MI)) { unsigned VReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + // TODO: Need mapping tables to handle other cases (register classes). + if (RC != &AMDGPU::VReg_128RegClass) + return; + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 
1 : 0; - - const TargetRegisterClass *RC; switch (BitsSet) { default: return; case 1: RC = &AMDGPU::VGPR_32RegClass; break; @@ -3379,6 +4477,8 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + if (!isTypeLegal(VT)) + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -3388,7 +4488,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, default: return std::make_pair(0U, nullptr); case 32: - return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 16: + return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); case 64: return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); case 128: @@ -3402,6 +4503,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, default: return std::make_pair(0U, nullptr); case 32: + case 16: return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); case 64: return std::make_pair(0U, &AMDGPU::VReg_64RegClass); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1d349fa..6c04e4f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, unsigned Offset) const; SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, - SDValue Chain, unsigned Offset, bool Signed) const; + SDValue Chain, unsigned Offset, bool Signed, + const ISD::InputArg *Arg = nullptr) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -33,11 +34,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; @@ -47,6 +48,16 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + /// \brief Converts \p Op, which must be of floating point type, to the + /// floating point type \p VT, by either extending or truncating it. + SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const; + + /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. 
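// The MIMG post-isel adjustment earlier in this hunk sizes the result
// register by how many dmask channels are enabled; a rough host-side
// equivalent of the BitsSet loop (not the in-tree code):
#include <bitset>

unsigned dmaskChannels(unsigned Writemask) {
  return unsigned(std::bitset<4>(Writemask & 0xf).count()); // 1 -> VGPR_32, 2 -> VReg_64, 3 -> VReg_96
}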
+ SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -58,14 +69,27 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performSHLPtrCombine(SDNode *N, unsigned AS, DAGCombinerInfo &DCI) const; + + SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const; + + SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL, + unsigned Opc, SDValue LHS, + const ConstantSDNode *CRHS) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + unsigned getFusedOpcode(const SelectionDAG &DAG, + const SDNode *N0, const SDNode *N1) const; + SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; @@ -73,6 +97,19 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isCFIntrinsic(const SDNode *Intr) const; void createDebuggerPrologueStackObjects(MachineFunction &MF) const; + + /// \returns True if fixup needs to be emitted for given global value \p GV, + /// false otherwise. + bool shouldEmitFixup(const GlobalValue *GV) const; + + /// \returns True if GOT relocation needs to be emitted for given global value + /// \p GV, false otherwise. + bool shouldEmitGOTReloc(const GlobalValue *GV) const; + + /// \returns True if PC-relative relocation needs to be emitted for given + /// global value \p GV, false otherwise. + bool shouldEmitPCReloc(const GlobalValue *GV) const; + public: SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); @@ -98,7 +135,9 @@ public: MachineFunction &MF) const override; bool isMemOpUniform(const SDNode *N) const; + bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -141,7 +180,6 @@ public: void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; - int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const override; void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp new file mode 100644 index 0000000..91e4bf7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -0,0 +1,329 @@ +//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass inserts branches on the 0 exec mask over divergent branches +/// branches when it's expected that jumping over the untaken control flow will +/// be cheaper than having every workitem no-op through it. +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/MC/MCAsmInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-insert-skips" + +namespace { + +static cl::opt<unsigned> SkipThresholdFlag( + "amdgpu-skip-threshold", + cl::desc("Number of instructions before jumping over divergent control flow"), + cl::init(12), cl::Hidden); + +class SIInsertSkips : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + unsigned SkipThreshold; + + bool shouldSkip(const MachineBasicBlock &From, + const MachineBasicBlock &To) const; + + bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); + + void kill(MachineInstr &MI); + + MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); + +public: + static char ID; + + SIInsertSkips() : + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI insert s_cbranch_execz instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SIInsertSkips::ID = 0; + +INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, + "SI insert s_cbranch_execz instructions", false, false) + +char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; + +static bool opcodeEmitsNoInsts(unsigned Opc) { + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::BUNDLE: + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + case TargetOpcode::DBG_VALUE: + return true; + default: + return false; + } +} + +bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, + const MachineBasicBlock &To) const { + if (From.succ_empty()) + return false; + + unsigned NumInstr = 0; + const MachineFunction *MF = From.getParent(); + + for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); + NumInstr < SkipThreshold && I != E; ++I) { + if (opcodeEmitsNoInsts(I->getOpcode())) + continue; + + // FIXME: Since this is required for correctness, this should be inserted + // during SILowerControlFlow. + + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken + // when EXEC = 0. We should skip the loop lest it becomes infinite. 
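// The cost model in shouldSkip(), reduced to its arithmetic: instructions are
// counted until SkipThreshold is reached (12 by default, -amdgpu-skip-threshold),
// inline asm is charged at its estimated byte size divided by the longest
// possible encoding, and opcodes that emit no machine code are free. Sketch:
#include <cstdint>

unsigned inlineAsmCost(uint64_t MaxAsmSizeBytes, unsigned MaxInstLength) {
  return unsigned(MaxAsmSizeBytes / MaxInstLength); // NumInstr += MaxAsmSize / MAI->getMaxInstLength()
}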
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || + I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) + return true; + + if (I->isInlineAsm()) { + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + const char *AsmStr = I->getOperand(0).getSymbolName(); + + // inlineasm length estimate is number of bytes assuming the longest + // instruction. + uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); + NumInstr += MaxAsmSize / MAI->getMaxInstLength(); + } else { + ++NumInstr; + } + + if (NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction *MF = MBB.getParent(); + + if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || + !shouldSkip(MBB, MBB.getParent()->back())) + return false; + + MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); + + const DebugLoc &DL = MI.getDebugLoc(); + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&NextBB); + + MachineBasicBlock::iterator Insert = SkipBB->begin(); + + // Exec mask is zero: Export to NULL target... + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addImm(1) // vm + .addImm(0) // compr + .addImm(0); // en + + // ... and terminate wavefront. + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + + return true; +} + +void SIInsertSkips::kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); + // Kill is only allowed in pixel / geometry shaders. + assert(CallConv == CallingConv::AMDGPU_PS || + CallConv == CallingConv::AMDGPU_GS); +#endif + // Clear this thread from the exec mask if the operand is negative. + if (Op.isImm()) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.getImm() & 0x80000000) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) + .addImm(0) + .addOperand(Op); + } +} + +MachineBasicBlock *SIInsertSkips::insertSkipBlock( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MachineFunction *MF = MBB.getParent(); + + MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, SkipBB); + MBB.addSuccessor(SkipBB); + + return SkipBB; +} + +// Returns true if a branch over the block was inserted. 
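// The helper below is what consumes shouldSkip(): for a SI_MASK_BRANCH whose
// skipped region is long enough, it leaves the pseudo untouched and inserts a
// real branch immediately after it, roughly (sketch, block names invented):
//
//   SI_MASK_BRANCH %bb.exit        ; pseudo left in place
//   s_cbranch_execz %bb.exit       ; inserted here: skip when no lane is live
//
// Short regions get nothing, on the assumption that letting every work item
// no-op through a handful of instructions is cheaper than a taken branch.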
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, + MachineBasicBlock &SrcMBB) { + MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); + + if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); + + BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addMBB(DestBB); + + return true; +} + +bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + SkipThreshold = SkipThresholdFlag; + + bool HaveKill = false; + bool MadeChange = false; + + // Track depth of exec mask, divergent branches. + SmallVector<MachineBasicBlock *, 16> ExecBranchStack; + + MachineFunction::iterator NextBB; + + MachineBasicBlock *EmptyMBBAtEnd = nullptr; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; BI = NextBB) { + NextBB = std::next(BI); + MachineBasicBlock &MBB = *BI; + + if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { + // Reached convergence point for last divergent branch. + ExecBranchStack.pop_back(); + } + + if (HaveKill && ExecBranchStack.empty()) { + HaveKill = false; + + // TODO: Insert skip if exec is 0? + } + + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + + MachineInstr &MI = *I; + + switch (MI.getOpcode()) { + case AMDGPU::SI_MASK_BRANCH: { + ExecBranchStack.push_back(MI.getOperand(0).getMBB()); + MadeChange |= skipMaskBranch(MI, MBB); + break; + } + case AMDGPU::S_BRANCH: { + // Optimize out branches to the next block. + // FIXME: Shouldn't this be handled by BranchFolding? + if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) + MI.eraseFromParent(); + break; + } + case AMDGPU::SI_KILL_TERMINATOR: { + MadeChange = true; + kill(MI); + + if (ExecBranchStack.empty()) { + if (skipIfDead(MI, *NextBB)) { + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + } else { + HaveKill = true; + } + + MI.eraseFromParent(); + break; + } + case AMDGPU::SI_RETURN: { + // FIXME: Should move somewhere else + assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (BI != --MF.end() || I != MBB.getFirstTerminator()) { + // SI_RETURN is not the last instruction. Add an empty block at + // the end and jump there. 
+ if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + I->eraseFromParent(); + } + } + default: + break; + } + } + } + + return MadeChange; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index d24588d..fceabd7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -21,6 +21,7 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -29,6 +30,7 @@ #define DEBUG_TYPE "si-insert-waits" using namespace llvm; +using namespace llvm::AMDGPU; namespace { @@ -59,13 +61,14 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; - - /// \brief Constant hardware limits - static const Counters WaitCounts; + IsaVersion IV; /// \brief Constant zero value static const Counters ZeroCounts; + /// \brief Hardware limits + Counters HardwareLimits; + /// \brief Counter values we have already waited on. Counters WaitedOn; @@ -90,6 +93,9 @@ private: bool LastInstWritesM0; + /// Whether or not we have flat operations outstanding. + bool IsFlatOutstanding; + /// \brief Whether the machine function returns void bool ReturnsVoid; @@ -145,7 +151,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { + StringRef getPassName() const override { return "SI insert wait instructions"; } @@ -170,11 +176,12 @@ FunctionPass *llvm::createSIInsertWaitsPass() { return new SIInsertWaits(); } -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } }; const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; -static bool readsVCCZ(unsigned Opcode) { - return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ; +static bool readsVCCZ(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); } bool SIInsertWaits::hasOutstandingLGKM() const { @@ -188,8 +195,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore(); // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { @@ -231,9 +237,10 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { if (Op.isDef()) return true; - // For exports all registers are relevant + // For exports all registers are relevant. + // TODO: Skip undef/disabled registers. MachineInstr &MI = *Op.getParent(); - if (MI.getOpcode() == AMDGPU::EXP) + if (TII->isEXP(MI)) return true; // For stores the stored value is also relevant @@ -245,12 +252,6 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands. 
- if (TII->isDS(MI) || TII->isFLAT(MI)) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); - if (Data && Op.isIdenticalTo(*Data)) - return true; - } - if (TII->isDS(MI)) { MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); if (Data0 && Op.isIdenticalTo(*Data0)) @@ -260,6 +261,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return Data1 && Op.isIdenticalTo(*Data1); } + if (TII->isFLAT(MI)) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + if (Data && Op.isIdenticalTo(*Data)) + return true; + } + // NOTE: This assumes that the value operand is before the // address operand, and that there is only one value operand. for (MachineInstr::mop_iterator I = MI.operands_begin(), @@ -292,6 +299,9 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, Counters Limit = ZeroCounts; unsigned Sum = 0; + if (TII->mayAccessFlatAddressSpace(*I)) + IsFlatOutstanding = true; + for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; if (Increment.Array[i]) @@ -330,7 +340,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Remember which export instructions we have seen if (Increment.Named.EXP) { - ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2; } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { @@ -366,8 +376,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, // Figure out if the async instructions execute in order bool Ordered[3]; - // VM_CNT is always ordered - Ordered[0] = true; + // VM_CNT is always ordered except when there are flat instructions, which + // can return out of order. + Ordered[0] = !IsFlatOutstanding; // EXP_CNT is unordered if we have both EXP & VM-writes Ordered[1] = ExpInstrTypesSeen == 3; @@ -376,7 +387,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, Ordered[2] = false; // The values we are going to put into the S_WAITCNT instruction - Counters Counts = WaitCounts; + Counters Counts = HardwareLimits; // Do we really need to wait? bool NeedWait = false; @@ -392,7 +403,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, unsigned Value = LastIssued.Array[i] - Required.Array[i]; // Adjust the value to the real hardware possibilities. - Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]); } else Counts.Array[i] = 0; @@ -410,12 +421,14 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm((Counts.Named.VM & 0xF) | - ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0xF) << 8)); + .addImm(encodeWaitcnt(IV, + Counts.Named.VM, + Counts.Named.EXP, + Counts.Named.LGKM)); LastOpcodeType = OTHER; LastInstWritesM0 = false; + IsFlatOutstanding = false; return true; } @@ -440,9 +453,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { unsigned Imm = I->getOperand(0).getImm(); Counters Counts, WaitOn; - Counts.Named.VM = Imm & 0xF; - Counts.Named.EXP = (Imm >> 4) & 0x7; - Counts.Named.LGKM = (Imm >> 8) & 0xF; + Counts.Named.VM = decodeVmcnt(IV, Imm); + Counts.Named.EXP = decodeExpcnt(IV, Imm); + Counts.Named.LGKM = decodeLgkmcnt(IV, Imm); for (unsigned i = 0; i < 3; ++i) { if (Counts.Array[i] <= LastIssued.Array[i]) @@ -491,7 +504,7 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. 
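// Side note on the S_WAITCNT change above: the removed code hard-coded one
// field layout, which can be reconstructed from its masks and shifts (sketch
// only; the encodeWaitcnt()/decode*cnt() helpers now own this per ISA version):
//
//   static unsigned encodeLegacyWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
//     return (VM & 0xF)            // vmcnt   in bits [3:0]
//          | ((EXP & 0x7) << 4)    // expcnt  in bits [6:4]
//          | ((LGKM & 0xF) << 8);  // lgkmcnt in bits [11:8]
//   }
//
// Newer subtargets presumably widen some of these fields, which is also why the
// fixed { 15, 7, 15 } limits are replaced by getVmcntBitMask() and friends.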
- if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); LastInstWritesM0 = false; return; @@ -518,26 +531,40 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); + IV = getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + HardwareLimits.Named.VM = getVmcntBitMask(IV); + HardwareLimits.Named.EXP = getExpcntBitMask(IV); + HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; - ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); + IsFlatOutstanding = false; + ReturnsVoid = MFI->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); SmallVector<MachineInstr *, 4> RemoveMI; + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + + bool HaveScalarStores = false; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (!HaveScalarStores && TII->isScalarStore(*I)) + HaveScalarStores = true; + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a @@ -557,7 +584,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { } // Check if we need to apply the bug work-around - if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) { + if (VCCZCorrupt && readsVCCZ(*I)) { DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); // Wait on everything, not just LGKM. vccz reads usually come from @@ -572,7 +599,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // vcc and then writing it back to the register. BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); + .addReg(AMDGPU::VCC); } } @@ -590,8 +617,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks - if (I->getOpcode() == AMDGPU::S_BARRIER || - I->getOpcode() == AMDGPU::S_SENDMSG) + if ((I->getOpcode() == AMDGPU::S_BARRIER && + ST->needWaitcntBeforeBarrier()) || + I->getOpcode() == AMDGPU::S_SENDMSG || + I->getOpcode() == AMDGPU::S_SENDMSGHALT) Required = LastIssued; else Required = handleOperands(*I); @@ -605,12 +634,45 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); + + if (I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) + EndPgmBlocks.push_back(&MBB); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. 
This could be + // improved by looking across blocks for flushes in postdominating blocks + // from the stores but an explicitly requested flush is probably very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { + Changes = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } + } + } + } + for (MachineInstr *I : RemoveMI) I->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 6163f05..5523ec1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -15,78 +15,111 @@ class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { - field bits<1> VM_CNT = 0; - field bits<1> EXP_CNT = 0; - field bits<1> LGKM_CNT = 0; - - field bits<1> SALU = 0; - field bits<1> VALU = 0; - - field bits<1> SOP1 = 0; - field bits<1> SOP2 = 0; - field bits<1> SOPC = 0; - field bits<1> SOPK = 0; - field bits<1> SOPP = 0; - - field bits<1> VOP1 = 0; - field bits<1> VOP2 = 0; - field bits<1> VOP3 = 0; - field bits<1> VOPC = 0; - field bits<1> SDWA = 0; - field bits<1> DPP = 0; - - field bits<1> MUBUF = 0; - field bits<1> MTBUF = 0; - field bits<1> SMRD = 0; - field bits<1> DS = 0; - field bits<1> MIMG = 0; - field bits<1> FLAT = 0; + // Low bits - basic encoding information. + field bit SALU = 0; + field bit VALU = 0; + + // SALU instruction formats. + field bit SOP1 = 0; + field bit SOP2 = 0; + field bit SOPC = 0; + field bit SOPK = 0; + field bit SOPP = 0; + + // VALU instruction formats. + field bit VOP1 = 0; + field bit VOP2 = 0; + field bit VOPC = 0; + field bit VOP3 = 0; + field bit VINTRP = 0; + field bit SDWA = 0; + field bit DPP = 0; + + // Memory instruction formats. + field bit MUBUF = 0; + field bit MTBUF = 0; + field bit SMRD = 0; + field bit MIMG = 0; + field bit EXP = 0; + field bit FLAT = 0; + field bit DS = 0; + + // Pseudo instruction formats. + field bit VGPRSpill = 0; + field bit SGPRSpill = 0; + + // High bits - other information. + field bit VM_CNT = 0; + field bit EXP_CNT = 0; + field bit LGKM_CNT = 0; // Whether WQM _must_ be enabled for this instruction. - field bits<1> WQM = 0; - field bits<1> VGPRSpill = 0; + field bit WQM = 0; + + // Whether WQM _must_ be disabled for this instruction. + field bit DisableWQM = 0; + + field bit Gather4 = 0; + + // Most sopk treat the immediate as a signed 16-bit, however some + // use it as unsigned. + field bit SOPKZext = 0; + + // This is an s_store_dword* instruction that requires a cache flush + // on wave termination. It is necessary to distinguish from mayStore + // SMEM instructions like the cache flush ones. + field bit ScalarStore = 0; + + // Whether the operands can be ignored when computing the + // instruction size. + field bit FixedSize = 0; // This bit tells the assembler to use the 32-bit encoding in case it // is unable to infer the encoding from the operands. 
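// The reshuffled TSFlags above put the encoding-format bits in the low half of
// the 64-bit word and the behavior bits from bit 32 up, so C++-side queries
// stay simple mask tests. Sketch (bit number taken from the `let TSFlags{...}`
// assignments in this hunk; the SIInstrFlags enum is assumed to match):
//
//   // e.g. "does this instruction increment LGKM_CNT?" under the new layout
//   static bool incrementsLgkmCnt(uint64_t TSFlags) {
//     return (TSFlags >> 34) & 1;   // let TSFlags{34} = LGKM_CNT
//   }
//
// Keeping this numbering in sync with SIInstrFlags, as the comment in the hunk
// says, is what lets helpers such as TII->isEXP() and isScalarStore() work.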
- field bits<1> VOPAsmPrefer32Bit = 0; + field bit VOPAsmPrefer32Bit = 0; - field bits<1> Gather4 = 0; + // These need to be kept in sync with the enum in SIInstrFlags. + let TSFlags{0} = SALU; + let TSFlags{1} = VALU; - // Whether WQM _must_ be disabled for this instruction. - field bits<1> DisableWQM = 0; + let TSFlags{2} = SOP1; + let TSFlags{3} = SOP2; + let TSFlags{4} = SOPC; + let TSFlags{5} = SOPK; + let TSFlags{6} = SOPP; - // These need to be kept in sync with the enum in SIInstrFlags. - let TSFlags{0} = VM_CNT; - let TSFlags{1} = EXP_CNT; - let TSFlags{2} = LGKM_CNT; - - let TSFlags{3} = SALU; - let TSFlags{4} = VALU; - - let TSFlags{5} = SOP1; - let TSFlags{6} = SOP2; - let TSFlags{7} = SOPC; - let TSFlags{8} = SOPK; - let TSFlags{9} = SOPP; - - let TSFlags{10} = VOP1; - let TSFlags{11} = VOP2; - let TSFlags{12} = VOP3; - let TSFlags{13} = VOPC; + let TSFlags{7} = VOP1; + let TSFlags{8} = VOP2; + let TSFlags{9} = VOPC; + let TSFlags{10} = VOP3; + + let TSFlags{13} = VINTRP; let TSFlags{14} = SDWA; let TSFlags{15} = DPP; let TSFlags{16} = MUBUF; let TSFlags{17} = MTBUF; let TSFlags{18} = SMRD; - let TSFlags{19} = DS; - let TSFlags{20} = MIMG; + let TSFlags{19} = MIMG; + let TSFlags{20} = EXP; let TSFlags{21} = FLAT; - let TSFlags{22} = WQM; + let TSFlags{22} = DS; + let TSFlags{23} = VGPRSpill; - let TSFlags{24} = VOPAsmPrefer32Bit; - let TSFlags{25} = Gather4; - let TSFlags{26} = DisableWQM; + let TSFlags{24} = SGPRSpill; + + let TSFlags{32} = VM_CNT; + let TSFlags{33} = EXP_CNT; + let TSFlags{34} = LGKM_CNT; + + let TSFlags{35} = WQM; + let TSFlags{36} = DisableWQM; + let TSFlags{37} = Gather4; + + let TSFlags{38} = SOPKZext; + let TSFlags{39} = ScalarStore; + let TSFlags{40} = FixedSize; + let TSFlags{41} = VOPAsmPrefer32Bit; let SchedRW = [Write32Bit]; @@ -95,6 +128,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bits<1> DisableDecoder = 0; let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); + let AsmVariantName = AMDGPUAsmVariants.Default; } class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []> @@ -103,376 +137,39 @@ class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []> let isCodeGenOnly = 1; } -class Enc32 { - field bits<32> Inst; - int Size = 4; -} - -class Enc64 { - field bits<64> Inst; - int Size = 8; -} - -class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; - -let Uses = [EXEC] in { - -class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VALU = 1; -} - -class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs), ins, asm, pattern> { - - let VOPC = 1; - let Size = 4; - let Defs = [VCC]; -} - -class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <outs, ins, asm, pattern> { - - let VOP1 = 1; - let Size = 4; -} - -class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <outs, ins, asm, pattern> { - - let VOP2 = 1; - let Size = 4; -} - -class VOP3Common <dag outs, dag ins, string asm = "", - list<dag> pattern = [], bit HasMods = 0, - bit VOP3Only = 0> : - VOPAnyCommon <outs, ins, asm, pattern> { - - // Using complex patterns gives VOP3 patterns a very high complexity rating, - // but standalone patterns are almost always prefered, so we need to adjust the - // priority lower. The goal is to use a high number to reduce complexity to - // zero (or less than zero). 
- let AddedComplexity = -1000; - - let VOP3 = 1; - let VALU = 1; - - let AsmMatchConverter = - !if(!eq(VOP3Only,1), - "cvtVOP3", - !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); - - let isCodeGenOnly = 0; - - int Size = 8; - - // Because SGPRs may be allowed if there are multiple operands, we - // need a post-isel hook to insert copies in order to avoid - // violating constant bus requirements. - let hasPostISelHook = 1; -} - -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Scalar operations -//===----------------------------------------------------------------------===// - -class SOP1e <bits<8> op> : Enc32 { - bits<7> sdst; - bits<8> src0; - - let Inst{7-0} = src0; - let Inst{15-8} = op; - let Inst{22-16} = sdst; - let Inst{31-23} = 0x17d; //encoding; -} - -class SOP2e <bits<7> op> : Enc32 { - bits<7> sdst; - bits<8> src0; - bits<8> src1; - - let Inst{7-0} = src0; - let Inst{15-8} = src1; - let Inst{22-16} = sdst; - let Inst{29-23} = op; - let Inst{31-30} = 0x2; // encoding -} - -class SOPCe <bits<7> op> : Enc32 { - bits<8> src0; - bits<8> src1; - - let Inst{7-0} = src0; - let Inst{15-8} = src1; - let Inst{22-16} = op; - let Inst{31-23} = 0x17e; -} - -class SOPKe <bits<5> op> : Enc32 { - bits <7> sdst; - bits <16> simm16; - - let Inst{15-0} = simm16; - let Inst{22-16} = sdst; - let Inst{27-23} = op; - let Inst{31-28} = 0xb; //encoding -} - -class SOPK64e <bits<5> op> : Enc64 { - bits <7> sdst = 0; - bits <16> simm16; - bits <32> imm; - - let Inst{15-0} = simm16; - let Inst{22-16} = sdst; - let Inst{27-23} = op; - let Inst{31-28} = 0xb; - - let Inst{63-32} = imm; -} - -class SOPPe <bits<7> op> : Enc32 { - bits <16> simm16; - - let Inst{15-0} = simm16; - let Inst{22-16} = op; - let Inst{31-23} = 0x17f; // encoding -} - -class SMRDe <bits<5> op, bits<1> imm> : Enc32 { - bits<7> sdst; - bits<7> sbase; - - let Inst{8} = imm; - let Inst{14-9} = sbase{6-1}; - let Inst{21-15} = sdst; - let Inst{26-22} = op; - let Inst{31-27} = 0x18; //encoding -} - -class SMRD_IMMe <bits<5> op> : SMRDe<op, 1> { - bits<8> offset; - let Inst{7-0} = offset; -} - -class SMRD_SOFFe <bits<5> op> : SMRDe<op, 0> { - bits<8> soff; - let Inst{7-0} = soff; -} - - - -class SMRD_IMMe_ci <bits<5> op> : Enc64 { - bits<7> sdst; - bits<7> sbase; - bits<32> offset; - - let Inst{7-0} = 0xff; - let Inst{8} = 0; - let Inst{14-9} = sbase{6-1}; - let Inst{21-15} = sdst; - let Inst{26-22} = op; - let Inst{31-27} = 0x18; //encoding - let Inst{63-32} = offset; -} - -let SchedRW = [WriteSALU] in { -class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern> { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 0; - let SALU = 1; - let SOP1 = 1; -} - -class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 0; - let SALU = 1; - let SOP2 = 1; - - let UseNamedOperandTable = 1; -} - -class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, SOPCe <op> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; +class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> + : PseudoInstSI<outs, ins, pattern> { let SALU = 1; - let SOPC = 1; - let isCodeGenOnly = 0; - let Defs = [SCC]; - - let UseNamedOperandTable = 1; } -class SOPK <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins , asm, pattern> { - 
- let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; - let SOPK = 1; - - let UseNamedOperandTable = 1; +class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> + : PseudoInstSI<outs, ins, pattern> { + let VALU = 1; + let Uses = [EXEC]; } -class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : - InstSI <(outs), ins, asm, pattern >, SOPPe <op> { +class CFPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], + bit UseExec = 0, bit DefExec = 0> : + SPseudoInstSI<outs, ins, pattern> { + let Uses = !if(UseExec, [EXEC], []); + let Defs = !if(DefExec, [EXEC, SCC], [SCC]); let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SALU = 1; - let SOPP = 1; - - let UseNamedOperandTable = 1; -} - -} // let SchedRW = [WriteSALU] - -class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern> { - - let LGKM_CNT = 1; - let SMRD = 1; - let mayStore = 0; - let mayLoad = 1; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let SchedRW = [WriteSMEM]; -} - -//===----------------------------------------------------------------------===// -// Vector ALU operations -//===----------------------------------------------------------------------===// - -class VOP1e <bits<8> op> : Enc32 { - bits<8> vdst; - bits<9> src0; - - let Inst{8-0} = src0; - let Inst{16-9} = op; - let Inst{24-17} = vdst; - let Inst{31-25} = 0x3f; //encoding -} - -class VOP2e <bits<6> op> : Enc32 { - bits<8> vdst; - bits<9> src0; - bits<8> src1; - - let Inst{8-0} = src0; - let Inst{16-9} = src1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; //encoding -} - -class VOP2_MADKe <bits<6> op> : Enc64 { - - bits<8> vdst; - bits<9> src0; - bits<8> src1; - bits<32> imm; - - let Inst{8-0} = src0; - let Inst{16-9} = src1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; // encoding - let Inst{63-32} = imm; -} - -class VOP3a <bits<9> op> : Enc64 { - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<1> clamp; - bits<2> omod; - - let Inst{8} = src0_modifiers{1}; - let Inst{9} = src1_modifiers{1}; - let Inst{10} = src2_modifiers{1}; - let Inst{11} = clamp; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP3e <bits<9> op> : VOP3a <op> { - bits<8> vdst; - - let Inst{7-0} = vdst; } -// Encoding used for VOPC instructions encoded as VOP3 -// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst -class VOP3ce <bits<9> op> : VOP3a <op> { - bits<8> sdst; - - let Inst{7-0} = sdst; +class Enc32 { + field bits<32> Inst; + int Size = 4; } -class VOP3be <bits<9> op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; +class Enc64 { + field bits<64> Inst; + int Size = 8; } -class VOPCe <bits<8> op> : Enc32 { - bits<9> src0; - bits<8> 
src1; - - let Inst{8-0} = src0; - let Inst{16-9} = src1; - let Inst{24-17} = op; - let Inst{31-25} = 0x3e; -} +class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; class VINTRPe <bits<2> op> : Enc32 { bits<8> vdst; @@ -488,88 +185,6 @@ class VINTRPe <bits<2> op> : Enc32 { let Inst{31-26} = 0x32; // encoding } -class DSe <bits<8> op> : Enc64 { - bits<8> vdst; - bits<1> gds; - bits<8> addr; - bits<8> data0; - bits<8> data1; - bits<8> offset0; - bits<8> offset1; - - let Inst{7-0} = offset0; - let Inst{15-8} = offset1; - let Inst{17} = gds; - let Inst{25-18} = op; - let Inst{31-26} = 0x36; //encoding - let Inst{39-32} = addr; - let Inst{47-40} = data0; - let Inst{55-48} = data1; - let Inst{63-56} = vdst; -} - -class MUBUFe <bits<7> op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<1> lds; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{15} = addr64; - let Inst{16} = lds; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MTBUFe <bits<3> op> : Enc64 { - bits<8> vdata; - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{15} = addr64; - let Inst{18-16} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - class MIMGe <bits<7> op> : Enc64 { bits<8> vdata; bits<4> dmask; @@ -600,26 +215,6 @@ class MIMGe <bits<7> op> : Enc64 { let Inst{57-53} = ssamp{6-2}; } -class FLATe<bits<7> op> : Enc64 { - bits<8> addr; - bits<8> data; - bits<8> vdst; - bits<1> slc; - bits<1> glc; - bits<1> tfe; - - // 15-0 is reserved. - let Inst{16} = glc; - let Inst{17} = slc; - let Inst{24-18} = op; - let Inst{31-26} = 0x37; // Encoding. - let Inst{39-32} = addr; - let Inst{47-40} = data; - // 54-48 is reserved. - let Inst{55} = tfe; - let Inst{63-56} = vdst; -} - class EXPe : Enc64 { bits<4> en; bits<6> tgt; @@ -645,92 +240,37 @@ class EXPe : Enc64 { let Uses = [EXEC] in { -class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP1Common <outs, ins, asm, pattern>, - VOP1e<op> { - let isCodeGenOnly = 0; -} - -class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP2Common <outs, ins, asm, pattern>, VOP2e<op> { - let isCodeGenOnly = 0; -} - -class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : - VOPCCommon <ins, asm, pattern>, VOPCe <op>; - class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { - let mayLoad = 1; + let VINTRP = 1; + // VINTRP instructions read parameter values from LDS, but these parameter + // values are stored outside of the LDS memory that is allocated to the + // shader for general purpose use. 
+ // + // While it may be possible for ds_read/ds_write instructions to access + // the parameter values in LDS, this would essentially be an out-of-bounds + // memory access which we consider to be undefined behavior. + // + // So even though these instructions read memory, this memory is outside the + // addressable memory space for the shader, and we consider these instructions + // to be readnone. + let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; } -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Vector I/O operations -//===----------------------------------------------------------------------===// - -class DS <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - let LGKM_CNT = 1; - let DS = 1; - let UseNamedOperandTable = 1; - let Uses = [M0, EXEC]; - - // Most instruction load and store data, so set this as the default. - let mayLoad = 1; - let mayStore = 1; - - let hasSideEffects = 0; - let AsmMatchConverter = "cvtDS"; - let SchedRW = [WriteLDS]; -} - -class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern> { - - let VM_CNT = 1; +class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + let EXP = 1; let EXP_CNT = 1; - let MUBUF = 1; - let Uses = [EXEC]; - - let hasSideEffects = 0; + let mayLoad = 0; // Set to 1 if done bit is set. + let mayStore = 1; let UseNamedOperandTable = 1; - let AsmMatchConverter = "cvtMubuf"; - let SchedRW = [WriteVMEM]; -} - -class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern> { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MTBUF = 1; let Uses = [EXEC]; - - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let SchedRW = [WriteVMEM]; + let SchedRW = [WriteExport]; } -class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, FLATe <op> { - let FLAT = 1; - // Internally, FLAT instruction are executed as both an LDS and a - // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT - // and are not considered done until both have been decremented. - let VM_CNT = 1; - let LGKM_CNT = 1; - - let Uses = [EXEC, FLAT_SCR]; // M0 - - let UseNamedOperandTable = 1; - let hasSideEffects = 0; - let SchedRW = [WriteVMEM]; -} +} // End Uses = [EXEC] class MIMG <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9190819..26a8d22 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -28,6 +28,13 @@ using namespace llvm; +// Must be at least 4 to be able to branch over minimum unconditional branch +// code. This is only for making it possible to write reasonably small tests for +// long branches. 
+static cl::opt<unsigned> +BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), + cl::desc("Restrict range of branch instructions (DEBUG)")); + SIInstrInfo::SIInstrInfo(const SISubtarget &ST) : AMDGPUInstrInfo(ST), RI(), ST(ST) {} @@ -258,7 +265,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); + if (SOffset && SOffset->isReg()) return false; const MachineOperand *AddrReg = @@ -270,6 +278,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, getNamedOperand(LdSt, AMDGPU::OpName::offset); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); + + if (SOffset) // soffset can be an inline immediate. + Offset += SOffset->getImm(); + return true; } @@ -287,7 +299,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, } if (isFLAT(LdSt)) { - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); BaseReg = AddrReg->getReg(); Offset = 0; return true; @@ -302,20 +314,16 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; - if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); } if (!FirstDst || !SecondDst) @@ -342,62 +350,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { + const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); - // If we are trying to copy to or from SCC, there is a bug somewhere else in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. 
- assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, - }; - - static const int16_t Sub0_15_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, - AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - }; - - static const int16_t Sub0_7_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - }; - - static const int16_t Sub0_3_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, - }; + if (RC == &AMDGPU::VGPR_32RegClass) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } - unsigned Opcode; - ArrayRef<int16_t> SubIndices; + if (RC == &AMDGPU::SReg_32_XM0RegClass || + RC == &AMDGPU::SReg_32RegClass) { + if (SrcReg == AMDGPU::SCC) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) + .addImm(-1) + .addImm(0); + return; + } - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; + } - } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (RC == &AMDGPU::SReg_64RegClass) { if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) @@ -405,7 +383,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. 
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -417,62 +395,29 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; + } - } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_3_64; - - } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_7_64; - - } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_15_64; - - } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (DestReg == AMDGPU::SCC) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); return; + } - } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_1; - - } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_2; - - } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || - AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_3; - - } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || - AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_7; - - } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || - AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_15; - - } else { - llvm_unreachable("Can't copy register!"); + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RC)) { + if (RC->getSize() > 4) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } } + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { @@ -497,9 +442,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } -int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { - const unsigned Opcode = MI.getOpcode(); - +int SIInstrInfo::commuteOpcode(unsigned Opcode) const { int NewOpc; // Try to map original to commuted opcode @@ -573,11 +516,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = 
MBB.findDebugLoc(MI); - unsigned Size = FrameInfo->getObjectSize(FrameIndex); - unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO @@ -587,20 +530,31 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling SGPRs. + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + + // The SGPR spill/restore instructions only work on number sgprs, so we need + // to make sure we are using the correct register class. if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { - // m0 may not be allowed for readlane. MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for spilling - // SGPRs. - unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // src - .addFrameIndex(FrameIndex) // frame_idx - .addMemOperand(MMO); + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) + .addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addMemOperand(MMO) + .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) + .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + // Add the scratch resource registers as implicit uses because we may end up + // needing them, and need to ensure that the reserved registers are + // correctly handled. + + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. 
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } return; } @@ -620,11 +574,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // src - .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(0) // offset + .addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } @@ -671,10 +625,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); - unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo.getObjectSize(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -685,17 +639,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); - + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { - // m0 may not be allowed for readlane. MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // frame_idx - .addMemOperand(MMO); + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) + .addFrameIndex(FrameIndex) // addr + .addMemOperand(MMO) + .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) + .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. 
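// For context (sketch only -- the real expansion happens later, during frame
// index elimination, not here): when scalar stores are available an SGPR
// spill/restore pseudo can end up as something like
//
//   s_mov_b32            m0, <scratch byte offset>
//   s_buffer_store_dword s5, s[0:3], m0       ; spill
//   s_buffer_load_dword  s5, s[0:3], m0       ; restore
//
// which is why M0 is listed as an implicit def here, and why the scratch rsrc
// and wave-offset registers are added as implicit uses: the later expansion
// must not find them reallocated.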
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } return; } @@ -713,7 +672,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // frame_idx + .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addImm(0) // offset @@ -729,7 +688,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); unsigned TIDReg = MFI->getTIDReg(); @@ -808,7 +767,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( } // Add FrameIndex to LDS offset - unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) .addImm(LDSOffset) .addReg(TIDReg); @@ -851,7 +810,24 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - + case AMDGPU::S_MOV_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + case AMDGPU::S_XOR_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_XOR_B64)); + break; + } + case AMDGPU::S_ANDN2_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_ANDN2_B64)); + break; + } case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -880,36 +856,37 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_MOVRELD_B32_V1: + case AMDGPU::V_MOVRELD_B32_V2: + case AMDGPU::V_MOVRELD_B32_V4: + case AMDGPU::V_MOVRELD_B32_V8: + case AMDGPU::V_MOVRELD_B32_V16: { + const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); + unsigned VecReg = MI.getOperand(0).getReg(); + bool IsUndef = MI.getOperand(1).isUndef(); + unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); + assert(VecReg == MI.getOperand(1).getReg()); + + MachineInstr *MovRel = + BuildMI(MBB, MI, DL, MovRelDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .addOperand(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + + const int ImpDefIdx = + MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); + const int ImpUseIdx = ImpDefIdx + 1; + MovRel->tieOperands(ImpDefIdx, ImpUseIdx); - case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); - const MachineOperand &SrcCond = MI.getOperand(3); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addReg(SrcCond.getReg()) - .addReg(Dst, RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill())) - .addReg(Dst, RegState::Implicit | RegState::Define); MI.eraseFromParent(); break; } - case AMDGPU::SI_PC_ADD_REL_OFFSET: { - const SIRegisterInfo *TRI - = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); MachineFunction &MF = *MBB.getParent(); unsigned Reg = MI.getOperand(0).getReg(); - unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. @@ -921,10 +898,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) .addOperand(MI.getOperand(1))); - Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0)); + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi); + if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addOperand(MI.getOperand(2)); + + Bundler.append(MIB); llvm::finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); @@ -934,91 +916,96 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } -/// Commutes the operands in the given instruction. -/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. -/// -/// Do not call this method for a non-commutable instruction or for -/// non-commutable pair of operand indices OpIdx0 and OpIdx1. -/// Even though the instruction is commutable, the method may still -/// fail to commute the operands, null pointer is returned in such cases. 
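// Summary of how the rewritten commuting below is organized, with one
// illustrative case (opcode and operands invented for the sketch): operand
// pairs are classified as reg/reg, reg/non-reg, or non-reg/non-reg. The first
// two are handled; two non-register operands are given up on (the FIXME below).
// For reg vs. immediate the register operand is rewritten in place:
//
//   %v = V_MUL_F32 %a, <imm>     ; src0 = %a (reg), src1 = immediate
//   --> commuted -->
//   %v = V_MUL_F32 <imm>, %a     ; src0 now holds the immediate
//                                ; (ChangeToImmediate), src1 the register
//                                ; (ChangeToRegister)
//
// and src0/src1 modifiers are swapped afterwards so neg/abs stay attached to
// the operand they originally modified.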
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, - unsigned OpIdx0, - unsigned OpIdx1) const { - int CommutedOpcode = commuteOpcode(MI); - if (CommutedOpcode == -1) - return nullptr; +bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, + MachineOperand &Src0, + unsigned Src0OpName, + MachineOperand &Src1, + unsigned Src1OpName) const { + MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); + if (!Src0Mods) + return false; - int Src0Idx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - if (!Src0.isReg()) + MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); + assert(Src1Mods && + "All commutable instructions have both src0 and src1 modifiers"); + + int Src0ModsVal = Src0Mods->getImm(); + int Src1ModsVal = Src1Mods->getImm(); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + return true; +} + +static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, + MachineOperand &RegOp, + MachineOperand &NonRegOp) { + unsigned Reg = RegOp.getReg(); + unsigned SubReg = RegOp.getSubReg(); + bool IsKill = RegOp.isKill(); + bool IsDead = RegOp.isDead(); + bool IsUndef = RegOp.isUndef(); + bool IsDebug = RegOp.isDebug(); + + if (NonRegOp.isImm()) + RegOp.ChangeToImmediate(NonRegOp.getImm()); + else if (NonRegOp.isFI()) + RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); + else return nullptr; - int Src1Idx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); + NonRegOp.setSubReg(SubReg); - if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || - OpIdx1 != static_cast<unsigned>(Src1Idx)) && - (OpIdx0 != static_cast<unsigned>(Src1Idx) || - OpIdx1 != static_cast<unsigned>(Src0Idx))) + return &MI; +} + +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, + unsigned Src0Idx, + unsigned Src1Idx) const { + assert(!NewMI && "this should never be used"); + + unsigned Opc = MI.getOpcode(); + int CommutedOpcode = commuteOpcode(Opc); + if (CommutedOpcode == -1) return nullptr; - MachineOperand &Src1 = MI.getOperand(Src1Idx); + assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == + static_cast<int>(Src0Idx) && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == + static_cast<int>(Src1Idx) && + "inconsistency with findCommutedOpIndices"); - if (isVOP2(MI) || isVOPC(MI)) { - const MCInstrDesc &InstrDesc = MI.getDesc(); - // For VOP2 and VOPC instructions, any operand type is valid to use for - // src0. Make sure we can use the src0 as src1. - // - // We could be stricter here and only allow commuting if there is a reason - // to do so. i.e. if both operands are VGPRs there is no real benefit, - // although MachineCSE attempts to find matches by commuting. - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) - return nullptr; - } + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); - MachineInstr *CommutedMI = &MI; - if (!Src1.isReg()) { - // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) { - return nullptr; + MachineInstr *CommutedMI = nullptr; + if (Src0.isReg() && Src1.isReg()) { + if (isOperandLegal(MI, Src1Idx, &Src0)) { + // Be sure to copy the source modifiers to the right place. 
+ CommutedMI + = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); } - // Be sure to copy the source modifiers to the right place. - if (MachineOperand *Src0Mods = - getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods = - getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - int Src0ModsVal = Src0Mods->getImm(); - if (!Src1Mods && Src0ModsVal != 0) - return nullptr; - - // XXX - This assert might be a lie. It might be useful to have a neg - // modifier with 0.0. - int Src1ModsVal = Src1Mods->getImm(); - assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); - - Src1Mods->setImm(Src0ModsVal); - Src0Mods->setImm(Src1ModsVal); - } - - unsigned Reg = Src0.getReg(); - unsigned SubReg = Src0.getSubReg(); - if (Src1.isImm()) - Src0.ChangeToImmediate(Src1.getImm()); - else - llvm_unreachable("Should only have immediates"); - Src1.ChangeToRegister(Reg, false); - Src1.setSubReg(SubReg); + } else if (Src0.isReg() && !Src1.isReg()) { + // src0 should always be able to support any operand type, so no need to + // check operand legality. + CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); + } else if (!Src0.isReg() && Src1.isReg()) { + if (isOperandLegal(MI, Src1Idx, &Src0)) + CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); } else { - CommutedMI = - TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); + // FIXME: Found two non registers to commute. This does happen. + return nullptr; } - if (CommutedMI) + + if (CommutedMI) { + swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, + Src1, AMDGPU::OpName::src1_modifiers); + CommutedMI->setDesc(get(CommutedOpcode)); + } return CommutedMI; } @@ -1028,8 +1015,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { - const MCInstrDesc &MCID = MI.getDesc(); - if (!MCID.isCommutable()) + if (!MI.isCommutable()) return false; unsigned Opc = MI.getOpcode(); @@ -1037,34 +1023,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, if (Src0Idx == -1) return false; - // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. Also, immediate src0 operand is not handled in - // SIInstrInfo::commuteInstruction(); - if (!MI.getOperand(Src0Idx).isReg()) - return false; - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (Src1.isImm()) { - // SIInstrInfo::commuteInstruction() does support commuting the immediate - // operand src1 in 2 and 3 operand instructions. - if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode())) - return false; - } else if (Src1.isReg()) { - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. - if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)) - return false; - } else - return false; - return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } +bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, + int64_t BrOffset) const { + // BranchRelaxation should never have to check s_setpc_b64 because its dest + // block is unanalyzable. + assert(BranchOp != AMDGPU::S_SETPC_B64); + + // Convert to dwords. 
+ BrOffset /= 4; + + // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is + // from the next instruction. + BrOffset -= 1; + + return isIntN(BranchOffsetBits, BrOffset); +} + +MachineBasicBlock *SIInstrInfo::getBranchDestBlock( + const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { + // This would be a difficult analysis to perform, but can always be legal so + // there's no need to analyze it. + return nullptr; + } + + return MI.getOperand(0).getMBB(); +} + +unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, + MachineBasicBlock &DestBB, + const DebugLoc &DL, + int64_t BrOffset, + RegScavenger *RS) const { + assert(RS && "RegScavenger required for long branching"); + assert(MBB.empty() && + "new block should be inserted for expanding unconditional branch"); + assert(MBB.pred_size() == 1); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // FIXME: Virtual register workaround for RegScavenger not working with empty + // blocks. + unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + auto I = MBB.end(); + + // We need to compute the offset relative to the instruction immediately after + // s_getpc_b64. Insert pc arithmetic code before last terminator. + MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); + + // TODO: Handle > 32-bit block address. + if (BrOffset >= 0) { + BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } else { + // Backwards branch. + BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } + + // Insert the indirect branch after the other terminator. + BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) + .addReg(PCReg); + + // FIXME: If spilling is necessary, this will fail because this scavenger has + // no emergency stack slots. It is non-trivial to spill in this situation, + // because the restore code needs to be specially placed after the + // jump. BranchRelaxation then needs to be made aware of the newly inserted + // block. + // + // If a spill is needed for the pc register pair, we need to insert a spill + // restore block right before the destination block, and insert a short branch + // into the old destination block's fallthrough predecessor. + // e.g.: + // + // s_cbranch_scc0 skip_long_branch: + // + // long_branch_bb: + // spill s[8:9] + // s_getpc_b64 s[8:9] + // s_add_u32 s8, s8, restore_bb + // s_addc_u32 s9, s9, 0 + // s_setpc_b64 s[8:9] + // + // skip_long_branch: + // foo; + // + // ..... 
+ // + // dest_bb_fallthrough_predecessor: + // bar; + // s_branch dest_bb + // + // restore_bb: + // restore s[8:9] + // fallthrough dest_bb + /// + // dest_bb: + // buzz; + + RS->enterBasicBlockEnd(MBB); + unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, + MachineBasicBlock::iterator(GetPC), 0); + MRI.replaceRegWith(PCReg, Scav); + MRI.clearVirtRegs(); + RS->setRegUsed(Scav); + + return 4 + 8 + 4 + 4; +} + unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { switch (Cond) { case SIInstrInfo::SCC_TRUE: @@ -1103,15 +1190,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { } } -bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { - MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - - if (I == MBB.end()) - return false; - +bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { if (I->getOpcode() == AMDGPU::S_BRANCH) { // Unconditional Branch TBB = I->getOperand(0).getMBB(); @@ -1124,6 +1208,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. ++I; @@ -1142,29 +1227,81 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, return true; } -unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { +bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I == MBB.end()) + return false; + + if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) + return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); + + ++I; + + // TODO: Should be able to treat as fallthrough? + if (I == MBB.end()) + return true; + + if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) + return true; + + MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); + + // Specifically handle the case where the conditional branch is to the same + // destination as the mask branch. e.g. + // + // si_mask_branch BB8 + // s_cbranch_execz BB8 + // s_cbranch BB9 + // + // This is required to understand divergent loops which may need the branches + // to be relaxed. 
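The branch-relaxation hooks added above rely on the S_CBRANCH encoding, where the hardware computes PC += signext(SIMM16) * 4 + 4, so a byte offset is first converted to dwords and then biased by one instruction before the range test. A self-contained sketch of that check, assuming a 16-bit offset field:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the dword conversion and the -1 bias for "offset is measured
    // from the next instruction"; 16 bits is an assumption for SIMM16.
    static bool branchOffsetInRange(int64_t byteOffset) {
      int64_t dwords = byteOffset / 4 - 1;
      return dwords >= INT16_MIN && dwords <= INT16_MAX;
    }

    int main() {
      printf("%d\n", branchOffsetInRange(131068));   // 32766 dwords: fits
      printf("%d\n", branchOffsetInRange(1 << 20));  // too far: needs the long-branch expansion
      return 0;
    }

When the check fails, insertIndirectBranch above falls back to the s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence, which appears to be where the 4 + 8 + 4 + 4 byte count it returns comes from.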
+ if (TBB != MaskBrDest || Cond.empty()) + return true; + + auto Pred = Cond[0].getImm(); + return (Pred != EXECZ && Pred != EXECNZ); +} + +unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); unsigned Count = 0; + unsigned RemovedSize = 0; while (I != MBB.end()) { MachineBasicBlock::iterator Next = std::next(I); + if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { + I = Next; + continue; + } + + RemovedSize += getInstSizeInBytes(*I); I->eraseFromParent(); ++Count; I = Next; } + if (BytesRemoved) + *BytesRemoved = RemovedSize; + return Count; } -unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, +unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const { + const DebugLoc &DL, + int *BytesAdded) const { if (!FBB && Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(TBB); + if (BytesAdded) + *BytesAdded = 4; return 1; } @@ -1174,24 +1311,42 @@ unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); if (!FBB) { - BuildMI(&MBB, DL, get(Opcode)) + Cond[1].isUndef(); + MachineInstr *CondBr = + BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); + + // Copy the flags onto the implicit condition register operand. + MachineOperand &CondReg = CondBr->getOperand(1); + CondReg.setIsUndef(Cond[1].isUndef()); + CondReg.setIsKill(Cond[1].isKill()); + + if (BytesAdded) + *BytesAdded = 4; return 1; } assert(TBB && FBB); - BuildMI(&MBB, DL, get(Opcode)) + MachineInstr *CondBr = + BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(FBB); + MachineOperand &CondReg = CondBr->getOperand(1); + CondReg.setIsUndef(Cond[1].isUndef()); + CondReg.setIsKill(Cond[1].isKill()); + + if (BytesAdded) + *BytesAdded = 8; + return 2; } -bool SIInstrInfo::ReverseBranchCondition( +bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { - assert(Cond.size() == 1); + assert(Cond.size() == 2); Cond[0].setImm(-Cond[0].getImm()); return false; } @@ -1210,15 +1365,43 @@ static void removeModOperands(MachineInstr &MI) { MI.RemoveOperand(Src0ModIdx); } -// TODO: Maybe this should be removed this and custom fold everything in -// SIFoldOperands? bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; unsigned Opc = UseMI.getOpcode(); - if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::COPY) { + bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + switch (DefMI.getOpcode()) { + default: + return false; + case AMDGPU::S_MOV_B64: + // TODO: We could fold 64-bit immediates, but this get compilicated + // when there are sub-registers. + return false; + + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::S_MOV_B32: + break; + } + unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); + assert(ImmOp); + // FIXME: We could handle FrameIndex values here. 
+ if (!ImmOp->isImm()) { + return false; + } + UseMI.setDesc(get(NewOpc)); + UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); + return true; + } + + if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || @@ -1232,14 +1415,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. - if (isInlineConstant(ImmOp, 4)) + MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + + // Any src operand can be used for the legality check. + if (isInlineConstant(UseMI, *Src0, ImmOp)) return false; - MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); - // Multiplied part is the constant: Use v_madmk_f32 + // Multiplied part is the constant: Use v_madmk_{f16, f32}. // We should only expect these to be on src0 due to canonicalizations. if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) @@ -1267,15 +1452,15 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); + UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1284,7 +1469,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } - // Added part is the constant: Use v_madak_f32 + // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. @@ -1306,17 +1491,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); + UseMI.setDesc(get(IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1375,6 +1560,17 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; + if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { + const MachineMemOperand *MMOa = *MIa.memoperands_begin(); + const MachineMemOperand *MMOb = *MIb.memoperands_begin(); + if (MMOa->getValue() && MMOb->getValue()) { + MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); + MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); + if (!AA->alias(LocA, LocB)) + return true; + } + } + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -1414,15 +1610,22 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + bool IsF16 = false; switch (MI.getOpcode()) { default: return nullptr; + case AMDGPU::V_MAC_F16_e64: + IsF16 = true; case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F16_e32: + IsF16 = true; case AMDGPU::V_MAC_F32_e32: { - const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src0); + const MachineOperand *Src0 = &MI.getOperand(Src0Idx); + if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) return nullptr; break; } @@ -1433,7 +1636,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .addOperand(*Dst) .addImm(0) // Src0 mods .addOperand(*Src0) @@ -1445,6 +1649,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, .addImm(0); // omod } +// It's not generally safe to move VALU instructions across these since it will +// start using the register as a base index rather than directly. +// XXX - Why isn't hasSideEffects sufficient for these? +static bool changesVGPRIndexingMode(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_SET_GPR_IDX_ON: + case AMDGPU::S_SET_GPR_IDX_MODE: + case AMDGPU::S_SET_GPR_IDX_OFF: + return true; + default: + return false; + } +} + bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { @@ -1454,67 +1672,78 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. 
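The FoldImmediate changes above extend the v_mac -> v_madmk/v_madak rewrite to the f16 forms. What the fold has to preserve is simply which source the literal K replaces; a small standalone model with plain floats (the constant 4.5 is just an example of a value with no inline encoding):

    #include <cassert>

    // v_mad:   d = s0 * s1 + s2   (three register sources)
    // v_madmk: d = s0 * K  + s1   (literal replaces the multiplied source)
    // v_madak: d = s0 * s1 + K    (literal replaces the added source)
    static float mad(float s0, float s1, float s2) { return s0 * s1 + s2; }
    static float madmk(float s0, float k, float s1) { return s0 * k + s1; }
    static float madak(float s0, float s1, float k) { return s0 * s1 + k; }

    int main() {
      const float K = 4.5f; // a literal that is not an inline constant
      // Folding "def = mov K" into a mac/mad user must not change the result.
      assert(mad(2.0f, K, 3.0f) == madmk(2.0f, K, 3.0f));
      assert(mad(2.0f, 3.0f, K) == madak(2.0f, 3.0f, K));
      return 0;
    }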
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || - MI.modifiesRegister(AMDGPU::EXEC, &RI); + MI.modifiesRegister(AMDGPU::EXEC, &RI) || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || + MI.getOpcode() == AMDGPU::S_SETREG_B32 || + changesVGPRIndexingMode(MI); } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int64_t SVal = Imm.getSExtValue(); - if (SVal >= -16 && SVal <= 64) - return true; - - if (Imm.getBitWidth() == 64) { - uint64_t Val = Imm.getZExtValue(); - return (DoubleToBits(0.0) == Val) || - (DoubleToBits(1.0) == Val) || - (DoubleToBits(-1.0) == Val) || - (DoubleToBits(0.5) == Val) || - (DoubleToBits(-0.5) == Val) || - (DoubleToBits(2.0) == Val) || - (DoubleToBits(-2.0) == Val) || - (DoubleToBits(4.0) == Val) || - (DoubleToBits(-4.0) == Val); - } - - // The actual type of the operand does not seem to matter as long - // as the bits match one of the inline immediate values. For example: - // - // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, - // so it is a legal inline immediate. - // - // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in - // floating-point, so it is a legal inline immediate. - uint32_t Val = Imm.getZExtValue(); - - return (FloatToBits(0.0f) == Val) || - (FloatToBits(1.0f) == Val) || - (FloatToBits(-1.0f) == Val) || - (FloatToBits(0.5f) == Val) || - (FloatToBits(-0.5f) == Val) || - (FloatToBits(2.0f) == Val) || - (FloatToBits(-2.0f) == Val) || - (FloatToBits(4.0f) == Val) || - (FloatToBits(-4.0f) == Val); + switch (Imm.getBitWidth()) { + case 32: + return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + case 64: + return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + case 16: + return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + default: + llvm_unreachable("invalid bitwidth"); + } } bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one. + uint8_t OperandType) const { + if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + return false; - unsigned BitSize = 8 * OpSize; - return isInlineConstant(APInt(BitSize, MO.getImm(), true)); - } + // MachineOperand provides no way to tell the true operand size, since it only + // records a 64-bit value. We need to know the size to determine if a 32-bit + // floating point immediate bit pattern is legal for an integer immediate. It + // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
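isInlineConstant now dispatches on the operand's declared bit width (16, 32 or 64) rather than on a register-class size, since the MachineOperand always stores the immediate in 64 bits. Below is an approximate standalone model of the 32-bit case only; the exact inline set and the 1/(2*pi) pattern gated by hasInv2PiInlineImm should be taken from AMDGPU::isInlinableLiteral32 rather than from this sketch.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint32_t bitsOf(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof(u));
      return u;
    }

    // Approximate 32-bit inline-constant test: small signed integers -16..64,
    // a handful of float bit patterns, and optionally 1/(2*pi).
    static bool isInlinableLiteral32(int32_t imm, bool hasInv2Pi) {
      if (imm >= -16 && imm <= 64)
        return true;
      uint32_t u = static_cast<uint32_t>(imm);
      const float fp[] = {0.0f, 0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
      for (float f : fp)
        if (u == bitsOf(f))
          return true;
      return hasInv2Pi && u == 0x3e22f983u; // assumed bit pattern of 1/(2*pi)
    }

    int main() {
      printf("%d\n", isInlinableLiteral32(64, false));           // 1: small integer
      printf("%d\n", isInlinableLiteral32(bitsOf(1.0f), false)); // 1: 0x3f800000
      printf("%d\n", isInlinableLiteral32(bitsOf(4.5f), false)); // 0: needs a literal dword
      return 0;
    }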
+ + int64_t Imm = MO.getImm(); + switch (operandBitWidth(OperandType)) { + case 32: { + int32_t Trunc = static_cast<int32_t>(Imm); + return Trunc == Imm && + AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); + } + case 64: { + return AMDGPU::isInlinableLiteral64(MO.getImm(), + ST.hasInv2PiInlineImm()); + } + case 16: { + if (isInt<16>(Imm) || isUInt<16>(Imm)) { + int16_t Trunc = static_cast<int16_t>(Imm); + return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + } - return false; + return false; + } + default: + llvm_unreachable("invalid bitwidth"); + } } -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); +bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + switch (MO.getType()) { + case MachineOperand::MO_Register: + return false; + case MachineOperand::MO_Immediate: + return !isInlineConstant(MO, OpInfo); + case MachineOperand::MO_FrameIndex: + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_MCSymbol: + return true; + default: + llvm_unreachable("unexpected operand type"); + } } static bool compareMachineOp(const MachineOperand &Op0, @@ -1544,11 +1773,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); + if (MO.isImm() && isInlineConstant(MO, OpInfo)) + return RI.opCanUseInlineConstant(OpInfo.OperandType); - return RI.opCanUseInlineConstant(OpInfo.OperandType); + return RI.opCanUseLiteralConstant(OpInfo.OperandType); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { @@ -1575,12 +1803,17 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, - unsigned OpSize) const { + const MCOperandInfo &OpInfo) const { // Literal constants use the constant bus. - if (isLiteralConstant(MO, OpSize)) - return true; + //if (isLiteralConstantLike(MO, OpInfo)) + // return true; + if (MO.isImm()) + return !isInlineConstant(MO, OpInfo); + + if (!MO.isReg()) + return true; // Misc other operands like FrameIndex - if (!MO.isReg() || !MO.isUse()) + if (!MO.isUse()) return false; if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) @@ -1644,6 +1877,16 @@ static bool shouldReadExec(const MachineInstr &MI) { return true; } +static bool isSubRegOf(const SIRegisterInfo &TRI, + const MachineOperand &SuperVec, + const MachineOperand &SubReg) { + if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) + return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); + + return SubReg.getSubReg() != AMDGPU::NoSubRegister && + SubReg.getReg() == SuperVec.getReg(); +} + bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); @@ -1660,6 +1903,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } + if (MI.isInlineAsm()) { + // Verify register classes for inlineasm constraints. 
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); + I != E; ++I) { + const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); + if (!RC) + continue; + + const MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { + ErrInfo = "inlineasm operand has incorrect register class."; + return false; + } + } + + return true; + } + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI.getOperand(i).isFPImm()) { @@ -1677,15 +1942,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } break; - case AMDGPU::OPERAND_REG_IMM32: + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: break; - case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI.getOperand(i), - RI.getRegClass(RegClass)->getSize())) { + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; } break; + } case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. @@ -1695,7 +1967,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Expected immediate, but got non-immediate"; return false; } - // Fall-through + LLVM_FALLTHROUGH; default: continue; } @@ -1737,7 +2009,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; @@ -1768,6 +2040,65 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSOPK(MI)) { + int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); + if (sopkIsZext(MI)) { + if (!isUInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } else { + if (!isInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } + } + + if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { + const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; + + const unsigned StaticNumOps = Desc.getNumOperands() + + Desc.getNumImplicitUses(); + const unsigned NumImplicitOps = IsDst ? 2 : 1; + + // Allow additional implicit operands. This allows a fixup done by the post + // RA scheduler where the main implicit operand is killed and implicit-defs + // are added for sub-registers that remain live after this instruction. 
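The SOPK verification added above distinguishes instructions whose 16-bit immediate field is zero-extended (the SOPK_ZEXT flag) from the default sign-extended ones. A minimal standalone model of the two range checks:

    #include <cstdint>
    #include <cstdio>

    // simm16 field of an SOPK instruction: unsigned 0..65535 when the
    // instruction zero-extends it, signed -32768..32767 otherwise.
    static bool sopkImmIsValid(int64_t imm, bool zeroExtended) {
      if (zeroExtended)
        return imm >= 0 && imm <= UINT16_MAX;
      return imm >= INT16_MIN && imm <= INT16_MAX;
    }

    int main() {
      printf("%d\n", sopkImmIsValid(40000, /*zeroExtended=*/true));  // 1
      printf("%d\n", sopkImmIsValid(40000, /*zeroExtended=*/false)); // 0
      printf("%d\n", sopkImmIsValid(-5,    /*zeroExtended=*/false)); // 1
      return 0;
    }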
+ if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { + ErrInfo = "missing implicit register operands"; + return false; + } + + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + if (IsDst) { + if (!Dst->isUse()) { + ErrInfo = "v_movreld_b32 vdst should be a use operand"; + return false; + } + + unsigned UseOpIdx; + if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || + UseOpIdx != StaticNumOps + 1) { + ErrInfo = "movrel implicit operands should be tied"; + return false; + } + } + + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &ImpUse + = MI.getOperand(StaticNumOps + NumImplicitOps - 1); + if (!ImpUse.isReg() || !ImpUse.isUse() || + !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { + ErrInfo = "src0 should be subreg of implicit vector use"; + return false; + } + } + // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. if (shouldReadExec(MI)) { @@ -1777,6 +2108,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSMRD(MI)) { + if (MI.mayStore()) { + // The register offset form of scalar stores may only use m0 as the + // soffset register. + const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); + if (Soff && Soff->getReg() != AMDGPU::M0) { + ErrInfo = "scalar stores must use m0 as offset register"; + return false; + } + } + } + return true; } @@ -1797,13 +2140,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; - case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; - case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; - case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; - case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; - case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; - case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; - case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; @@ -1830,6 +2173,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; + case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; + case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1937,11 +2282,10 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { - // XXX - Is there a better way to do this? 
if (SubIdx == AMDGPU::sub0) - return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); if (SubIdx == AMDGPU::sub1) - return MachineOperand::CreateImm(Op.getImm() >> 32); + return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); llvm_unreachable("Unhandled register index for immediate"); } @@ -1978,8 +2322,8 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, // In order to be legal, the common sub-class must be equal to the // class of the current operand. For example: // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // v_mov_b32 s0 ; Operand defined as vsrc_b32 + // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL // // s_sendmsg 0, s0 ; Operand defined as m0reg // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL @@ -2008,7 +2352,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { RegSubRegPair SGPRUsed; if (MO->isReg()) @@ -2020,7 +2364,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && - usesConstantBus(MRI, Op, getOpSize(MI, i))) { + usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { return false; } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { @@ -2202,6 +2546,39 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, } } +void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, + MachineRegisterInfo &MRI, + const DebugLoc &DL) const { + + unsigned OpReg = Op.getReg(); + unsigned OpSubReg = Op.getSubReg(); + + const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( + RI.getRegClassForReg(MRI, OpReg), OpSubReg); + + // Check if operand is already the correct register class. + if (DstRC == OpRC) + return; + + unsigned DstReg = MRI.createVirtualRegister(DstRC); + MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setSubReg(0); + + MachineInstr *Def = MRI.getVRegDef(OpReg); + if (!Def) + return; + + // Try to eliminate the copy if it is copying an immediate value. + if (Def->isMoveImmediate()) + FoldImmediate(*Copy, *Def, OpReg, &MRI); +} + void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineFunction &MF = *MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -2260,15 +2637,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; - unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - Op.setReg(DstReg); + // Avoid creating no-op copies with the same src and dst reg class. These + // confuse some of the machine passes. 
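buildExtractSubRegOrImm above now truncates both halves of a 64-bit immediate with static_cast<int32_t>, presumably so that each piece is stored in canonical sign-extended form inside the 64-bit MachineOperand immediate field. A self-contained illustration of the difference between the masked and the truncated low half:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Splitting a 64-bit immediate into the sub0/sub1 halves of a 64-bit
      // register pair.  Example value: -2 (all high bits set).
      int64_t imm = -2;

      int64_t lowMasked = imm & 0xFFFFFFFF;                // old form: 4294967294
      int32_t lowTrunc  = static_cast<int32_t>(imm);       // new form: -2
      int32_t highTrunc = static_cast<int32_t>(imm >> 32); // -1

      printf("masked low  = %lld\n", static_cast<long long>(lowMasked));
      printf("trunc  low  = %d\n", lowTrunc);
      printf("trunc  high = %d\n", highTrunc);

      // Reassembling the halves gives back the original value either way;
      // the difference is only how the 32-bit piece is represented while it
      // sits in a 64-bit immediate field.
      int64_t rebuilt = (static_cast<int64_t>(highTrunc) << 32) |
                        static_cast<uint32_t>(lowTrunc);
      printf("rebuilt     = %lld\n", static_cast<long long>(rebuilt));
      return 0;
    }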
+ legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); } } @@ -2292,12 +2668,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { if (VRC == OpRC) continue; - unsigned DstReg = MRI.createVirtualRegister(VRC); - - BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - - Op.setReg(DstReg); + legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); Op.setIsKill(); } } @@ -2313,11 +2684,9 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI.getParent(); - unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI.getOperand(1).setReg(NewSrc0); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &Op = MI.getOperand(1); + legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); } return; } @@ -2664,6 +3033,22 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; unsigned DstReg = Inst.getOperand(0).getReg(); + if (Inst.isCopy() && + TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + continue; + } + NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } @@ -2927,10 +3312,16 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), - E = MRI.use_end(); I != E; ++I) { + E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { Worklist.push_back(&UseMI); + + do { + ++I; + } while (I != E && I->getParent() == &UseMI); + } else { + ++I; } } } @@ -3098,6 +3489,56 @@ bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); } +unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, + int &FrameIndex) const { + const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (!Addr || !Addr->isFI()) + return AMDGPU::NoRegister; + + assert(!MI.memoperands_empty() && + (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); + + FrameIndex = Addr->getIndex(); + return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); +} + +unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, + int &FrameIndex) const { + const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); + assert(Addr && Addr->isFI()); + FrameIndex = Addr->getIndex(); + return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); +} + +unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + + if (!MI.mayLoad()) + return AMDGPU::NoRegister; + + if 
(isMUBUF(MI) || isVGPRSpill(MI)) + return isStackAccess(MI, FrameIndex); + + if (isSGPRSpill(MI)) + return isSGPRStackAccess(MI, FrameIndex); + + return AMDGPU::NoRegister; +} + +unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (!MI.mayStore()) + return AMDGPU::NoRegister; + + if (isMUBUF(MI) || isVGPRSpill(MI)) + return isStackAccess(MI, FrameIndex); + + if (isSGPRSpill(MI)) + return isSGPRStackAccess(MI, FrameIndex); + + return AMDGPU::NoRegister; +} + unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); @@ -3105,32 +3546,45 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // If we have a definitive size, we can use it. Otherwise we need to inspect // the operands to know the size. - if (DescSize == 8 || DescSize == 4) + // + // FIXME: Instructions that have a base 32-bit encoding report their size as + // 4, even though they are really 8 bytes if they have a literal operand. + if (DescSize != 0 && DescSize != 4) return DescSize; - assert(DescSize == 0); + if (Opc == AMDGPU::WAVE_BARRIER) + return 0; // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { + if (isFixedSize(MI)) { + assert(DescSize == 4); + return DescSize; + } + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return 4; // No operands. - if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) + if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) return 8; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return 4; - if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) + if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) return 8; return 4; } + if (DescSize == 4) + return 4; + switch (Opc) { + case AMDGPU::SI_MASK_BRANCH: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: @@ -3147,6 +3601,20 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { } } +bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { + if (!isFLAT(MI)) + return false; + + if (MI.memoperands_empty()) + return true; + + for (const MachineMemOperand *MMO : MI.memoperands()) { + if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + return true; + } + return false; +} + ArrayRef<std::pair<int, const char *>> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair<int, const char *> TargetIndices[] = { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index fef8904..e68f6f9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -86,6 +86,10 @@ private: unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: + bool swapSourceModifiers(MachineInstr &MI, + MachineOperand &Src0, unsigned Src0OpName, + MachineOperand &Src1, unsigned Src1OpName) const; + MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; @@ -94,7 +98,18 @@ public: enum TargetOperandFlags { MO_NONE = 0, - MO_GOTPCREL = 1 + // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. + MO_GOTPCREL = 1, + // MO_GOTPCREL32_LO -> symbol@gotpcrel32@lo -> R_AMDGPU_GOTPCREL32_LO. 
+ MO_GOTPCREL32 = 2, + MO_GOTPCREL32_LO = 2, + // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI. + MO_GOTPCREL32_HI = 3, + // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO. + MO_REL32 = 4, + MO_REL32_LO = 4, + // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. + MO_REL32_HI = 5 }; explicit SIInstrInfo(const SISubtarget &); @@ -144,23 +159,48 @@ public: unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; LLVM_READONLY - int commuteOpcode(const MachineInstr &MI) const; + int commuteOpcode(unsigned Opc) const; + + LLVM_READONLY + inline int commuteOpcode(const MachineInstr &MI) const { + return commuteOpcode(MI.getOpcode()); + } bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + bool isBranchOffsetInRange(unsigned BranchOpc, + int64_t BrOffset) const override; + + MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; + + unsigned insertIndirectBranch(MachineBasicBlock &MBB, + MachineBasicBlock &NewDestBB, + const DebugLoc &DL, + int64_t BrOffset, + RegScavenger *RS = nullptr) const override; + + bool analyzeBranchImpl(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved = nullptr) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const override; + const DebugLoc &DL, + int *BytesAdded = nullptr) const override; - bool ReverseBranchCondition( + bool reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const override; bool @@ -332,6 +372,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isEXP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::EXP; + } + + bool isEXP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::EXP; + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -356,6 +404,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } + static bool isSGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SGPRSpill; + } + + bool isSGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; + } + static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } @@ -372,6 +428,32 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; } + static bool sopkIsZext(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT; + } + + bool sopkIsZext(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT; + } + + /// \returns true if this is an s_store_dword* instruction. This is more + /// specific than than isSMEM && mayStore. 
+ static bool isScalarStore(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE; + } + + bool isScalarStore(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE; + } + + static bool isFixedSize(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE; + } + + bool isFixedSize(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -380,9 +462,96 @@ public: return !RI.isSGPRReg(MRI, Dest); } + static int operandBitWidth(uint8_t OperandType) { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return 32; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return 64; + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + return 16; + default: + llvm_unreachable("unexpected operand type"); + } + } + bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; + + bool isInlineConstant(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + return isInlineConstant(MO, OpInfo.OperandType); + } + + /// \p returns true if \p UseMO is substituted with \p DefMO in \p MI it would + /// be an inline immediate. + bool isInlineConstant(const MachineInstr &MI, + const MachineOperand &UseMO, + const MachineOperand &DefMO) const { + assert(UseMO.getParent() == &MI); + int OpIdx = MI.getOperandNo(&UseMO); + if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands) { + return false; + } + + return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]); + } + + /// \p returns true if the operand \p OpIdx in \p MI is a valid inline + /// immediate. + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const { + const MachineOperand &MO = MI.getOperand(OpIdx); + return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, + const MachineOperand &MO) const { + if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands) + return false; + + if (MI.isCopy()) { + unsigned Size = getOpSize(MI, OpIdx); + assert(Size == 8 || Size == 4); + + uint8_t OpType = (Size == 8) ? 
+ AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32; + return isInlineConstant(MO, OpType); + } + + return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineOperand &MO) const { + const MachineInstr *Parent = MO.getParent(); + return isInlineConstant(*Parent, Parent->getOperandNo(&MO)); + } + + bool isLiteralConstant(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType); + } + + bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const { + const MachineOperand &MO = MI.getOperand(OpIdx); + return MO.isImm() && !isInlineConstant(MI, OpIdx); + } + + // Returns true if this operand could potentially require a 32-bit literal + // operand, but not necessarily. A FrameIndex for example could resolve to an + // inline immediate value that will not require an additional 4-bytes; this + // assumes that it will. + bool isLiteralConstantLike(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const; bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; @@ -394,7 +563,7 @@ public: /// \brief Returns true if this operand uses the constant bus. bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, - unsigned OpSize) const; + const MCOperandInfo &OpInfo) const; /// \brief Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. @@ -487,6 +656,12 @@ public: void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; + void legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, MachineRegisterInfo &MRI, + const DebugLoc &DL) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr &MI) const; @@ -535,7 +710,17 @@ public: return get(pseudoToMCOpcode(Opcode)); } - unsigned getInstSizeInBytes(const MachineInstr &MI) const; + unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const; + unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; + + unsigned isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + + unsigned getInstSizeInBytes(const MachineInstr &MI) const override; + + bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; ArrayRef<std::pair<int, const char *>> getSerializableTargetIndices() const override; @@ -570,10 +755,19 @@ namespace AMDGPU { LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); + LLVM_READONLY + int getSOPKOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23); + + // For MachineOperands. 
+ enum TargetFlags { + TF_LONG_BRANCH_FORWARD = 1 << 0, + TF_LONG_BRANCH_BACKWARD = 1 << 1 + }; } // End namespace AMDGPU namespace SI { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 00f53e8..ebaefae 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -14,75 +14,6 @@ def isCIOnly : Predicate<"Subtarget->getGeneration() ==" def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; -class vop { - field bits<9> SI3; - field bits<10> VI3; -} - -class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {0, si{7-0}}; - field bits<10> VI3 = {0, 0, vi{7-0}}; -} - -class vop1 <bits<8> si, bits<8> vi = si> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {1, 1, si{6-0}}; - field bits<10> VI3 = !add(0x140, vi); -} - -class vop2 <bits<6> si, bits<6> vi = si> : vop { - field bits<6> SI = si; - field bits<6> VI = vi; - - field bits<9> SI3 = {1, 0, 0, si{5-0}}; - field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; -} - -// Specify a VOP2 opcode for SI and VOP3 opcode for VI -// that doesn't have VOP2 encoding on VI -class vop23 <bits<6> si, bits<10> vi> : vop2 <si> { - let VI3 = vi; -} - -class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop { - let SI3 = si; - let VI3 = vi; -} - -class sop1 <bits<8> si, bits<8> vi = si> { - field bits<8> SI = si; - field bits<8> VI = vi; -} - -class sop2 <bits<7> si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -class sopk <bits<5> si, bits<5> vi = si> { - field bits<5> SI = si; - field bits<5> VI = vi; -} - -class dsop <bits<8> si, bits<8> vi = si> { - field bits<8> SI = si; - field bits<8> VI = vi; -} - -// Specify an SMRD opcode for SI and SMEM opcode for VI - -// FIXME: This should really be bits<5> si, Tablegen crashes if -// parameter default value is other parameter with different bit size -class smrd<bits<8> si, bits<8> vi = si> { - field bits<5> SI = si{4-0}; - field bits<8> VI = vi; -} - // Execpt for the NONE field, this must be kept in sync with the // SIEncodingFamily enum in AMDGPUInstrInfo.cpp def SIEncodingFamily { @@ -127,6 +58,19 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", [SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +def SDTBufferLoad : SDTypeProfile<1, 5, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex + SDTCisVT<3, i32>, // offset + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>]>; // slc + +def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; + def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, SDTCisVT<3, i32>]> @@ -143,72 +87,15 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", - SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> + SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; //===----------------------------------------------------------------------===// -// PatFrags for FLAT instructions -//===----------------------------------------------------------------------===// - -class flat_ld 
<SDPatternOperator ld> : PatFrag<(ops node:$ptr), - (ld node:$ptr), [{ - const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -def flat_load : flat_ld <load>; -def atomic_flat_load : flat_ld<atomic_load>; -def flat_az_extloadi8 : flat_ld <az_extloadi8>; -def flat_sextloadi8 : flat_ld <sextloadi8>; -def flat_az_extloadi16 : flat_ld <az_extloadi16>; -def flat_sextloadi16 : flat_ld <sextloadi16>; - -class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - const MemSDNode *ST = cast<MemSDNode>(N); - return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -def flat_store: flat_st <store>; -def atomic_flat_store: flat_st <atomic_store>; -def flat_truncstorei8 : flat_st <truncstorei8>; -def flat_truncstorei16 : flat_st <truncstorei16>; - -class MubufLoad <SDPatternOperator op> : PatFrag < - (ops node:$ptr), (op node:$ptr), [{ - - const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -def mubuf_load : MubufLoad <load>; -def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>; -def mubuf_sextloadi8 : MubufLoad <sextloadi8>; -def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>; -def mubuf_sextloadi16 : MubufLoad <sextloadi16>; - -def mubuf_load_atomic : MubufLoad <atomic_load>; - -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ - auto Ld = cast<LoadSDNode>(N); - return Ld->getAlignment() >= 4 && - Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); -}]>; - -//===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// -def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; -def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; - -def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>; -def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>; +defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; +defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; //===----------------------------------------------------------------------===// // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 @@ -338,36 +225,6 @@ def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>; -// Transformation function, extract the lower 32bit of a 64bit immediate -def LO32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), - MVT::i32); -}]>; - -def LO32f : SDNodeXForm<fpimm, [{ - APInt V = N->getValueAPF().bitcastToAPInt().trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); -}]>; - -// Transformation function, extract the upper 32bit of a 64bit immediate -def HI32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); -}]>; - -def HI32f : SDNodeXForm<fpimm, [{ - APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), - MVT::f32); -}]>; - -def 
IMM8bitDWORD : PatLeaf <(imm), - [{return (N->getZExtValue() & ~0x3FC) == 0;}] ->; - -def as_dword_i32imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); -}]>; - def as_i1imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); }]>; @@ -394,24 +251,17 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); }]>; +def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{ + auto FI = cast<FrameIndexSDNode>(N); + return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32); +}]>; + // Copied from the AArch64 backend: def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; -def IMM8bit : PatLeaf <(imm), - [{return isUInt<8>(N->getZExtValue());}] ->; - -def IMM12bit : PatLeaf <(imm), - [{return isUInt<12>(N->getZExtValue());}] ->; - -def IMM16bit : PatLeaf <(imm), - [{return isUInt<16>(N->getZExtValue());}] ->; - def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; @@ -420,15 +270,6 @@ def IMM20bit : PatLeaf <(imm), [{return isUInt<20>(N->getZExtValue());}] >; -def IMM32bit : PatLeaf <(imm), - [{return isUInt<32>(N->getZExtValue());}] ->; - -def mubuf_vaddr_offset : PatFrag< - (ops node:$ptr, node:$offset, node:$imm_offset), - (add (add node:$ptr, node:$offset), node:$imm_offset) ->; - class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -437,29 +278,31 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ return isInlineImmediate(N); }]>; -class SGPRImm <dag frag> : PatLeaf<frag, [{ +class VGPRImm <dag frag> : PatLeaf<frag, [{ if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + unsigned Limit = 0; for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { + Limit < 10 && U != E; ++U, ++Limit) { const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (RC && SIRI->isSGPRClass(RC)) - return true; + + // If the register class is unknown, it could be an unknown + // register class that needs to be an SGPR, e.g. 
an inline asm + // constraint + if (!RC || SIRI->isSGPRClass(RC)) + return false; } - return false; + + return Limit < 10; }]>; //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// -def FRAMEri32 : Operand<iPTR> { - let MIOperandInfo = (ops i32:$ptr, i32imm:$index); -} - def SoppBrTarget : AsmOperandClass { let Name = "SoppBrTarget"; let ParserMethod = "parseSOppBrTarget"; @@ -467,14 +310,51 @@ def SoppBrTarget : AsmOperandClass { def sopp_brtarget : Operand<OtherVT> { let EncoderMethod = "getSOPPBrEncoding"; + let DecoderMethod = "decodeSoppBrTarget"; let OperandType = "OPERAND_PCREL"; let ParserMatchClass = SoppBrTarget; } def si_ga : Operand<iPTR>; +def InterpSlotMatchClass : AsmOperandClass { + let Name = "InterpSlot"; + let PredicateMethod = "isInterpSlot"; + let ParserMethod = "parseInterpSlot"; + let RenderMethod = "addImmOperands"; +} + def InterpSlot : Operand<i32> { let PrintMethod = "printInterpSlot"; + let ParserMatchClass = InterpSlotMatchClass; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AttrMatchClass : AsmOperandClass { + let Name = "Attr"; + let PredicateMethod = "isInterpAttr"; + let ParserMethod = "parseInterpAttr"; + let RenderMethod = "addImmOperands"; +} + +// It appears to be necessary to create a separate operand for this to +// be able to parse attr<num> with no space. +def Attr : Operand<i32> { + let PrintMethod = "printInterpAttr"; + let ParserMatchClass = AttrMatchClass; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AttrChanMatchClass : AsmOperandClass { + let Name = "AttrChan"; + let PredicateMethod = "isAttrChan"; + let RenderMethod = "addImmOperands"; +} + +def AttrChan : Operand<i32> { + let PrintMethod = "printInterpAttrChan"; + let ParserMatchClass = AttrChanMatchClass; + let OperandType = "OPERAND_IMMEDIATE"; } def SendMsgMatchClass : AsmOperandClass { @@ -484,6 +364,13 @@ def SendMsgMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def ExpTgtMatchClass : AsmOperandClass { + let Name = "ExpTgt"; + let PredicateMethod = "isExpTgt"; + let ParserMethod = "parseExpTgt"; + let RenderMethod = "printExpTgt"; +} + def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; let ParserMatchClass = SendMsgMatchClass; @@ -495,6 +382,11 @@ def SWaitMatchClass : AsmOperandClass { let ParserMethod = "parseSWaitCntOps"; } +def VReg32OrOffClass : AsmOperandClass { + let Name = "VReg32OrOff"; + let ParserMethod = "parseVReg32OrOff"; +} + def WAIT_FLAG : Operand <i32> { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; @@ -503,6 +395,31 @@ def WAIT_FLAG : Operand <i32> { include "SIInstrFormats.td" include "VIInstrFormats.td" +// ===----------------------------------------------------------------------===// +// ExpSrc* Special cases for exp src operands which are printed as +// "off" depending on en operand. 
+// ===----------------------------------------------------------------------===// + +def ExpSrc0 : RegisterOperand<VGPR_32> { + let PrintMethod = "printExpSrc0"; + let ParserMatchClass = VReg32OrOffClass; +} + +def ExpSrc1 : RegisterOperand<VGPR_32> { + let PrintMethod = "printExpSrc1"; + let ParserMatchClass = VReg32OrOffClass; +} + +def ExpSrc2 : RegisterOperand<VGPR_32> { + let PrintMethod = "printExpSrc2"; + let ParserMatchClass = VReg32OrOffClass; +} + +def ExpSrc3 : RegisterOperand<VGPR_32> { + let PrintMethod = "printExpSrc3"; + let ParserMatchClass = VReg32OrOffClass; +} + class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -547,16 +464,15 @@ def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>; def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; -def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>; -def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>; - -def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; +def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; +def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; +def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; @@ -572,33 +488,96 @@ def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; +def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { + +} + } // End OperandType = "OPERAND_IMMEDIATE" +class KImmMatchClass<int size> : AsmOperandClass { + let Name = "KImmFP"#size; + let PredicateMethod = "isKImmFP"#size; + let ParserMethod = "parseImm"; + let RenderMethod = "addKImmFP"#size#"Operands"; +} + +class kimmOperand<ValueType vt> : Operand<vt> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_KIMM"#vt.Size; + let PrintMethod = "printU"#vt.Size#"ImmOperand"; + let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass"); +} + +// 32-bit VALU immediate operand that uses the constant bus. +def KImmFP32MatchClass : KImmMatchClass<32>; +def f32kimm : kimmOperand<i32>; + +// 32-bit VALU immediate operand with a 16-bit value that uses the +// constant bus. 
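The f32kimm operand above and the f16kimm operand defined next are VALU literal operands that, as the comments note, consume the constant bus. As a loose mental model of why that matters, the sketch below encodes the SI/VI rule that a VALU instruction gets at most one constant-bus read (an SGPR or a literal); it is an illustration only, not the backend's verifier, and it ignores the refinement that repeated reads of the same SGPR count once.

  // Sketch only (not backend code): on SI/VI a VALU instruction may make at
  // most one read over the constant bus, and SGPR operands and literal/kimm
  // operands both count as such a read.
  #include <cassert>
  #include <initializer_list>

  enum class OpKind { VGPR, SGPR, Literal };

  static bool fitsConstantBus(std::initializer_list<OpKind> Ops) {
    int BusReads = 0;
    for (OpKind K : Ops)
      if (K != OpKind::VGPR)
        ++BusReads; // SGPRs and literals share the single bus slot
    return BusReads <= 1;
  }

  int main() {
    assert(fitsConstantBus({OpKind::Literal, OpKind::VGPR}));  // kimm + VGPR is fine
    assert(!fitsConstantBus({OpKind::Literal, OpKind::SGPR})); // would need two reads
  }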
+def KImmFP16MatchClass : KImmMatchClass<16>; +def f16kimm : kimmOperand<i16>; + def VOPDstS64 : VOPDstOperand <SReg_64>; -def FPInputModsMatchClass : AsmOperandClass { - let Name = "RegOrImmWithFPInputMods"; +class FPInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "RegOrImmWithFP"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithFPInputMods"; - let PredicateMethod = "isRegOrImmWithInputMods"; + let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } +def FP16InputModsMatchClass : FPInputModsMatchClass<16>; +def FP32InputModsMatchClass : FPInputModsMatchClass<32>; +def FP64InputModsMatchClass : FPInputModsMatchClass<64>; -def FPInputMods : Operand <i32> { +class InputMods <AsmOperandClass matchClass> : Operand <i32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_INPUT_MODS"; + let ParserMatchClass = matchClass; +} + +class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> { let PrintMethod = "printOperandAndFPInputMods"; - let ParserMatchClass = FPInputModsMatchClass; } -def IntInputModsMatchClass : AsmOperandClass { - let Name = "RegOrImmWithIntInputMods"; +def FP16InputMods : FPInputMods<FP16InputModsMatchClass>; +def FP32InputMods : FPInputMods<FP32InputModsMatchClass>; +def FP64InputMods : FPInputMods<FP64InputModsMatchClass>; + +class IntInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "RegOrImmWithInt"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithIntInputMods"; - let PredicateMethod = "isRegOrImmWithInputMods"; + let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods"; +} +def Int32InputModsMatchClass : IntInputModsMatchClass<32>; +def Int64InputModsMatchClass : IntInputModsMatchClass<64>; + +class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} +def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; +def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; + +def FPVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isVReg"; } -def IntInputMods: Operand <i32> { +def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + +def IntVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isVReg"; +} + +def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; - let ParserMatchClass = IntInputModsMatchClass; } + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -606,24 +585,6 @@ def IntInputMods: Operand <i32> { def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; -def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; -def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">; -def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; -def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; -def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; -def MUBUFOffsetAtomic : ComplexPattern<i64, 4, 
"SelectMUBUFOffset">; -def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">; -def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">; - -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; -def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; - def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; @@ -681,455 +642,44 @@ class SIMCInstr <string pseudo, int subtarget> { // EXP classes //===----------------------------------------------------------------------===// -class EXPCommon : InstSI< +class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), - "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - let EXP_CNT = 1; - let Uses = [EXEC]; - let SchedRW = [WriteExport]; -} - -multiclass EXP_m { - - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ; - } - - def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe { - let DecoderNamespace="SICI"; - let DisableDecoder = DisableSIDecoder; - } - - def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi { - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; - } -} - -//===----------------------------------------------------------------------===// -// Scalar classes -//===----------------------------------------------------------------------===// - -class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - SOP1 <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : - SOP1 <outs, ins, asm, []>, - SOP1e <op.SI>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : - SOP1 <outs, ins, asm, []>, - SOP1e <op.VI>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> { - - def "" : SOP1_Pseudo <opName, outs, ins, pattern>; - - def _si : SOP1_Real_si <op, opName, outs, ins, asm>; - - def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>; - -} - -multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0), - opName#" $sdst, $src0", pattern ->; - -multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0), - opName#" $sdst, $src0", pattern ->; - -// no input, 64-bit output. 
-multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs SReg_64:$sdst), (ins), pattern>; - - def _si : SOP1_Real_si <op, opName, (outs SReg_64:$sdst), (ins), - opName#" $sdst"> { - let src0 = 0; - } - - def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$sdst), (ins), - opName#" $sdst"> { - let src0 = 0; - } -} - -// 64-bit input, no output -multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>; - - def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0), - opName#" $src0"> { - let sdst = 0; - } - - def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0), - opName#" $src0"> { - let sdst = 0; - } -} - -// 64-bit input, 32-bit output. -multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0), - opName#" $sdst, $src0", pattern ->; - -// 32-bit input, 64-bit output. -multiclass SOP1_64_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0), - opName#" $sdst, $src0", pattern ->; - -class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : - SOP2<outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let Size = 4; - - // Pseudo instructions have no encodings, but adding this field here allows - // us to do: - // let sdst = xxx in { - // for multiclasses that include both real and pseudo instructions. - field bits<7> sdst = 0; -} - -class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : - SOP2<outs, ins, asm, []>, - SOP2e<op.SI>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : - SOP2<outs, ins, asm, []>, - SOP2e<op.VI>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> { - - def "" : SOP2_Pseudo <opName, outs, ins, pattern>; - - def _si : SOP2_Real_si <op, opName, outs, ins, asm>; - - def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>; - -} - -multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $sdst, $src0, $src1", pattern ->; - -multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $sdst, $src0, $src1", pattern ->; - -multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $sdst, $src0, $src1", pattern ->; - -multiclass SOP2_64_32_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $sdst, $src0, $src1", pattern ->; - -class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, - string opName, list<dag> pattern = []> : SOPC < - op, (outs), (ins rc0:$src0, rc1:$src1), - opName#" $src0, $src1", pattern > { - let Defs = [SCC]; -} -class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC_Base < 
- op, rc, rc, opName, - [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { -} - -class SOPC_CMP_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper<op, SSrc_32, i32, opName, cond>; - -class SOPC_32<bits<7> op, string opName, list<dag> pattern = []> - : SOPC_Base<op, SSrc_32, SSrc_32, opName, pattern>; - -class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []> - : SOPC_Base<op, SSrc_64, SSrc_32, opName, pattern>; - -class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - SOPK <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : - SOPK <outs, ins, asm, []>, - SOPKe <op.SI>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; - let isCodeGenOnly = 0; -} - -class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : - SOPK <outs, ins, asm, []>, - SOPKe <op.VI>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; - let isCodeGenOnly = 0; -} - -multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm, - string asm = opName#opAsm> { - def "" : SOPK_Pseudo <opName, outs, ins, []>; - - def _si : SOPK_Real_si <op, opName, outs, ins, asm>; - - def _vi : SOPK_Real_vi <op, opName, outs, ins, asm>; - -} - -multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), - pattern>; - - def _si : SOPK_Real_si <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), - opName#" $sdst, $simm16">; - - def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), - opName#" $sdst, $simm16">; -} - -multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs), - (ins SReg_32:$src0, u16imm:$src1), pattern> { - let Defs = [SCC]; - } - - - def _si : SOPK_Real_si <op, opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { - let Defs = [SCC]; - } - - def _vi : SOPK_Real_vi <op, opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { - let Defs = [SCC]; - } -} - -multiclass SOPK_32TIE <sopk op, string opName, list<dag> pattern> : SOPK_m < - op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), - " $sdst, $simm16" ->; - -multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, - string argAsm, string asm = opName#argAsm> { - - def "" : SOPK_Pseudo <opName, outs, ins, []>; - - def _si : SOPK <outs, ins, asm, []>, - SOPK64e <op.SI>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; - let isCodeGenOnly = 0; - } - - def _vi : SOPK <outs, ins, asm, []>, - SOPK64e <op.VI>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; - let isCodeGenOnly = 0; - } -} -//===----------------------------------------------------------------------===// -// SMRD classes -//===----------------------------------------------------------------------===// - -class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - SMRD <outs, ins, "", pattern>, - SIMCInstr<opName, 
SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SMRD_IMM_Real_si <bits<5> op, string opName, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, - SMRD_IMMe <op>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class SMRD_SOFF_Real_si <bits<5> op, string opName, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, - SMRD_SOFFe <op>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - - -class SMRD_IMM_Real_vi <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pattern = []> : - SMRD <outs, ins, asm, pattern>, - SMEM_IMMe_vi <op>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -class SMRD_SOFF_Real_vi <bits<8> op, string opName, dag outs, dag ins, - string asm, list<dag> pattern = []> : - SMRD <outs, ins, asm, pattern>, - SMEM_SOFFe_vi <op>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - - -multiclass SMRD_IMM_m <smrd op, string opName, dag outs, dag ins, - string asm, list<dag> pattern> { - - def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - - def _si : SMRD_IMM_Real_si <op.SI, opName, outs, ins, asm>; - - // glc is only applicable to scalar stores, which are not yet - // implemented. - let glc = 0 in { - def _vi : SMRD_IMM_Real_vi <op.VI, opName, outs, ins, asm>; - } -} - -multiclass SMRD_SOFF_m <smrd op, string opName, dag outs, dag ins, - string asm, list<dag> pattern> { - - def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - - def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, ins, asm>; - - // glc is only applicable to scalar stores, which are not yet - // implemented. - let glc = 0 in { - def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, ins, asm>; - } -} - -multiclass SMRD_Special <smrd op, string opName, dag outs, - int sdst_ = ?, - string opStr = "", - list<dag> pattern = []> { - let hasSideEffects = 1 in { - def "" : SMRD_Pseudo <opName, outs, (ins), pattern>; + (ins exp_tgt:$tgt, + ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3, + exp_vm:$vm, exp_compr:$compr, i8imm:$en), + "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", + [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr), + f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> { + let AsmMatchConverter = "cvtExp"; +} + +// Split EXP instruction into EXP and EXP_DONE so we can set +// mayLoad for done=1. 
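In the EXP_m multiclass that follows, the ExpSrc0-3 operands defined earlier, the exp_tgt target, the compr/vm bits and the en mask all feed a single asm string; the done-form variants add the trailing " done" token and are marked mayLoad, per the comment above. As a rough illustration of the en convention (bit i enables source i in the uncompressed case, and a disabled source is the one the ExpSrc printers show as "off"), with printExpSrc being an invented helper rather than the real MCInstPrinter code:

  #include <cstdio>

  // Illustration only: bit I of the "en" mask enables export source I;
  // a source whose bit is clear is printed as "off".
  static void printExpSrc(unsigned En, unsigned SrcIdx, const char *Reg) {
    std::printf("%s", (En & (1u << SrcIdx)) ? Reg : "off");
  }

  int main() {
    unsigned En = 0x5; // sources 0 and 2 enabled
    const char *Regs[4] = {"v0", "v1", "v2", "v3"};
    for (unsigned I = 0; I != 4; ++I) {
      printExpSrc(En, I, Regs[I]);
      std::printf(I == 3 ? "\n" : ", "); // prints: v0, off, v2, off
    }
  }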
+multiclass EXP_m<bit done, SDPatternOperator node> { + let mayLoad = done in { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXP_Helper<done, node>, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>; + } - let sbase = 0, soff = 0, sdst = sdst_ in { - def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, (ins), opName#opStr>; + let done = done in { + def _si : EXP_Helper<done>, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, + EXPe { + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; + } - let glc = 0 in { - def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, (ins), opName#opStr>; + def _vi : EXP_Helper<done>, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, + EXPe_vi { + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } } } -multiclass SMRD_Inval <smrd op, string opName, - SDPatternOperator node> { - let mayStore = 1 in { - defm : SMRD_Special<op, opName, (outs), 0, "", [(node)]>; - } -} - -class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : - SMRD_SOFF_Real_vi<op, opName, (outs), (ins), opName, [(node)]> { - let hasSideEffects = 1; - let mayStore = 1; - let sbase = 0; - let sdst = 0; - let glc = 0; - let soff = 0; -} - -class SMEM_Ret <bits<8> op, string opName, SDPatternOperator node> : - SMRD_SOFF_Real_vi<op, opName, (outs SReg_64:$sdst), (ins), - opName#" $sdst", [(set i64:$sdst, (node))]> { - let hasSideEffects = 1; - let mayStore = ?; - let mayLoad = ?; - let sbase = 0; - let glc = 0; - let soff = 0; -} - -multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, - RegisterClass dstClass> { - defm _IMM : SMRD_IMM_m < - op, opName#"_IMM", (outs dstClass:$sdst), - (ins baseClass:$sbase, smrd_offset:$offset), - opName#" $sdst, $sbase, $offset", [] - >; - - def _IMM_ci : SMRD < - (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset), - opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; - } - - defm _SGPR : SMRD_SOFF_m < - op, opName#"_SGPR", (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soff), - opName#" $sdst, $sbase, $soff", [] - >; -} - //===----------------------------------------------------------------------===// // Vector ALU classes //===----------------------------------------------------------------------===// @@ -1146,43 +696,99 @@ class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { // instructions for the given VT. class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, - VOPDstOperand<SReg_64>))); // else VT == i1 + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>)))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
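getVALUDstForVT above and the getVOPSrc0ForVT class that follows are plain size/type dispatch written as nested !if chains. A compact C++ rendering of the same decision trees may be easier to scan; the enum and function names here are illustrative only, not LLVM APIs.

  #include <cstdio>

  // Mirrors the !if chains: destination class by result size, and source-0
  // class by size plus whether the type is floating point.
  enum class RC { VGPR_32, VReg_64, VReg_128, SReg_64,
                  VSrc_b16, VSrc_b32, VSrc_b64,
                  VSrc_f16, VSrc_f32, VSrc_f64 };

  static RC pickVALUDst(unsigned Bits) {
    if (Bits == 32)  return RC::VGPR_32;
    if (Bits == 128) return RC::VReg_128;
    if (Bits == 64)  return RC::VReg_64;
    if (Bits == 16)  return RC::VGPR_32;
    return RC::SReg_64;                 // else VT == i1
  }

  static RC pickVOPSrc0(unsigned Bits, bool IsFP) {
    if (Bits == 64) return IsFP ? RC::VSrc_f64 : RC::VSrc_b64;
    if (Bits == 16) return IsFP ? RC::VSrc_f16 : RC::VSrc_b16;
    return IsFP ? RC::VSrc_f32 : RC::VSrc_b32;
  }

  int main() {
    // e.g. an f16 source selects VSrc_f16, an i1 result selects SReg_64.
    std::printf("%d %d\n", (int)pickVOPSrc0(16, true), (int)pickVALUDst(1));
  }

getVOP3SrcForVT further down extends the same pattern with the VCSrc_*/SCSrc_b64 classes and a 128-bit case.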
class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + RegisterOperand ret = !if(isFP, + !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)), + !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32))); } // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); + RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, + !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); RegisterOperand ret = - !if(!eq(VT.Size, 64), - VCSrc_64, - !if(!eq(VT.Value, i1.Value), - SCSrc_64, - VCSrc_32 - ) - ); + !if(!eq(VT.Size, 128), + VSrc_128, + !if(!eq(VT.Size, 64), + !if(isFP, + VCSrc_f64, + VCSrc_b64), + !if(!eq(VT.Value, i1.Value), + SCSrc_b64, + !if(isFP, + !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32), + !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32) + ) + ) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. // XXX - do f16 instructions? -class hasModifiers<ValueType SrcVT> { +class isFloatType<ValueType SrcVT> { bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, - 0)); + 0))); +} + +class isIntType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, i16.Value), 1, + !if(!eq(SrcVT.Value, i32.Value), 1, + !if(!eq(SrcVT.Value, i64.Value), 1, + 0))); +} + + +// Return type of input modifiers operand for specified input operand +class getSrcMod <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(!eq(VT.Size, 64), + !if(isFP, FP64InputMods, Int64InputMods), + !if(isFP, + !if(!eq(VT.Value, f16.Value), + FP16InputMods, + FP32InputMods + ), + Int32InputMods) + ); +} + +// Return type of input modifiers operand specified input operand for SDWA/DPP +class getSrcModExt <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1195,7 +801,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { // Returns the input arguments for VOP3 instructions for the given SrcVT. 
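The getIns64 class below assembles the VOP3 (ins ...) dag from NumSrcArgs, HasModifiers and the per-source modifier operand types chosen by getSrcMod above. A small sketch of the resulting operand order, using plain strings instead of a dag (vop3Ins is an invented name, not part of the backend):

  #include <string>
  #include <vector>

  // With modifiers each source contributes "srcN_modifiers, srcN" and the
  // list ends with clamp and omod; without modifiers only the bare sources
  // remain, matching the getIns64 cases below.
  static std::vector<std::string> vop3Ins(int NumSrcArgs, bool HasModifiers) {
    std::vector<std::string> Ins;
    for (int I = 0; I < NumSrcArgs; ++I) {
      std::string Src = "src" + std::to_string(I);
      if (HasModifiers)
        Ins.push_back(Src + "_modifiers");
      Ins.push_back(Src);
    }
    if (HasModifiers && NumSrcArgs > 0) {
      Ins.push_back("clamp");
      Ins.push_back("omod");
    }
    return Ins;
  }

  int main() {
    // A three-source VOP3 with modifiers carries eight input operands.
    return vop3Ins(3, true).size() == 8 ? 0 : 1;
  }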
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, RegisterOperand Src2RC, int NumSrcArgs, - bit HasModifiers> { + bit HasModifiers, Operand Src0Mod, Operand Src1Mod, + Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), @@ -1205,7 +812,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1 with modifiers - (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, omod:$omod) /* else */, // VOP1 without modifiers @@ -1214,8 +821,8 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), // VOP 2 with modifiers - (ins FPInputMods:$src0_modifiers, Src0RC:$src0, - FPInputMods:$src1_modifiers, Src1RC:$src1, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, omod:$omod) /* else */, // VOP2 without modifiers @@ -1224,9 +831,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), // VOP3 with modifiers - (ins FPInputMods:$src0_modifiers, Src0RC:$src0, - FPInputMods:$src1_modifiers, Src1RC:$src1, - FPInputMods:$src2_modifiers, Src2RC:$src2, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, clampmod:$clamp, omod:$omod) /* else */, // VOP3 without modifiers @@ -1235,7 +842,7 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, } class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasModifiers> { + bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { dag ret = !if (!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1244,7 +851,7 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1_DPP with modifiers - (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, @@ -1255,8 +862,8 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, /* NumSrcArgs == 2 */, !if (!eq(HasModifiers, 1), // VOP2_DPP with modifiers - (ins FPInputMods:$src0_modifiers, Src0RC:$src0, - FPInputMods:$src1_modifiers, Src1RC:$src1, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, @@ -1268,49 +875,28 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, } class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasFloatModifiers, ValueType DstVT> { + bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) (ins), !if(!eq(NumSrcArgs, 1), - !if(HasFloatModifiers, - // VOP1_SDWA with float modifiers - (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel) - /* else */, - // VOP1_SDWA with sext modifier - (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel) - /* endif */) - /* NumSrcArgs == 2 */, - !if(HasFloatModifiers, - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with float modifiers - (ins FPInputMods:$src0_fmodifiers, 
Src0RC:$src0, - FPInputMods:$src1_fmodifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA or VOPC_SDWA with float modifiers - (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, - FPInputMods:$src1_fmodifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel) - ), - /* else */ - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with sext modifiers - (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, - IntInputMods:$src1_imodifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA or VOPC_SDWA with sext modifier - (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, - IntInputMods:$src1_imodifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel) - ) - /* endif */))); + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + !if(!eq(NumSrcArgs, 2), + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA with modifiers + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA or VOPC_SDWA with modifiers + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel)), + (ins)/* endif */))); } // Outs for DPP and SDWA @@ -1374,8 +960,8 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, " vcc", // use vcc token as dst for VOPC instructioins "$vdst"), ""); - string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers"); - string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers"); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; string args = !if(!eq(NumSrcArgs, 0), "", !if(!eq(NumSrcArgs, 1), @@ -1414,6 +1000,14 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ); } +class BitOr<bit a, bit b> { + bit ret = !if(a, 1, !if(b, 1, 0)); +} + +class BitAnd<bit a, bit b> { + bit ret = !if(a, !if(b, 1, 0), 0); +} + class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1434,11 +1028,41 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; + field Operand Src0Mod = getSrcMod<Src0VT>.ret; + field Operand Src1Mod = getSrcMod<Src1VT>.ret; + field Operand Src2Mod = getSrcMod<Src2VT>.ret; + field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; + field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; + field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; - field bit HasModifiers = hasModifiers<Src0VT>.ret; + field bit HasSrc0 = !if(!eq(Src0VT.Value, untyped.Value), 0, 1); + field bit HasSrc1 = !if(!eq(Src1VT.Value, untyped.Value), 0, 1); + field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); + + // TODO: Modifiers logic is somewhat adhoc here, to be refined later + field bit HasModifiers = isFloatType<Src0VT>.ret; + + 
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret; + field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret; + field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret; + + field bit HasSrc0IntMods = isIntType<Src0VT>.ret; + field bit HasSrc1IntMods = isIntType<Src1VT>.ret; + field bit HasSrc2IntMods = isIntType<Src2VT>.ret; + + field bit HasSrc0Mods = HasModifiers; + field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0); + field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); + + field bit HasOMod = HasModifiers; + field bit HasClamp = HasModifiers; + field bit HasSDWAClamp = HasSrc0; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; @@ -1449,13 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, - HasModifiers>.ret; - field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers>.ret; - field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, DstVT>.ret; + HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, + HasModifiers, Src0ModDPP, Src1ModDPP>.ret; + field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, + HasModifiers, Src0ModSDWA, Src1ModSDWA, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; @@ -1467,14 +1094,13 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; } -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; +def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; @@ -1492,6 +1118,7 @@ def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; @@ -1500,181 +1127,21 @@ def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -// Write out to vcc or arbitrary SGPR. 
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { - let Asm32 = "$vdst, vcc, $src0, $src1"; - let Asm64 = "$vdst, $sdst, $src0, $src1"; - let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); -} - -// Write out to vcc or arbitrary SGPR and read in from vcc or -// arbitrary SGPR. -def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - // We use VCSrc_32 to exclude literal constants, even though the - // encoding normally allows them since the implicit VCC use means - // using one would always violate the constant bus - // restriction. SGPRs are still allowed because it should - // technically be possible to use VCC again as src0. - let Src0RC32 = VCSrc_32; - let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; - let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; - let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); - - // Suppress src2 implied by type since the 32-bit encoding uses an - // implicit VCC use. - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); -} - -// Read in from vcc or arbitrary SGPR -def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. - let Asm32 = "$vdst, $src0, $src1, vcc"; - let Asm64 = "$vdst, $src0, $src1, $src2"; - let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst); - - // Suppress src2 implied by type since the 32-bit encoding uses an - // implicit VCC use. - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); -} - -class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); - let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; -} - -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { - // FIXME: Hack to stop printing _e64 - let DstRC = RegisterOperand<VGPR_32>; -} - -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { - // FIXME: Hack to stop printing _e64 - let DstRC = RegisterOperand<VReg_64>; -} - -// VOPC instructions are a special case because for the 32-bit -// encoding, we want to display the implicit vcc write as if it were -// an explicit $dst. -class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { - let Asm32 = "vcc, $src0, $src1"; - // The destination for 32-bit encoding is implicit. 
- let HasDst32 = 0; - let Outs64 = (outs DstRC:$sdst); -} - -class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { - let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$sdst, $src0_modifiers, $src1"; - let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0, - IntInputMods:$src1_imodifiers, Src1RC64:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); - let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel"; - -} - -def VOPC_I1_F32_F32 : VOPC_Profile<f32>; -def VOPC_I1_F64_F64 : VOPC_Profile<f64>; -def VOPC_I1_I32_I32 : VOPC_Profile<i32>; -def VOPC_I1_I64_I64 : VOPC_Profile<i64>; - -def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; -def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; - def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>; +def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm); - field string Asm32 = "$vdst, $src0, $src1, $imm"; - field bit HasExt = 0; -} -def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1); - field string Asm32 = "$vdst, $src0, $imm, $src1"; - field bit HasExt = 0; -} -def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); - let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, - HasModifiers>.ret; - let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0, - FPInputMods:$src1_modifiers, Src1RC32:$src1, - VGPR_32:$src2, // stub argument - dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, - bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0, - FPInputMods:$src1_fmodifiers, Src1RC32:$src1, - VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); - let Asm32 = getAsm32<1, 2, f32>.ret; - let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; - let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret; -} def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; +def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; +def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; -// This class is used only with VOPC instructions. 
Use $sdst for out operand -class SIInstAlias <string asm, Instruction inst, VOPProfile p> : - InstAlias <asm, (inst)>, PredicateControl { - - field bit isCompare; - field bit isCommutable; - - let ResultInst = - !if (p.HasDst32, - !if (!eq(p.NumSrcArgs, 0), - // 1 dst, 0 src - (inst p.DstRC:$sdst), - !if (!eq(p.NumSrcArgs, 1), - // 1 dst, 1 src - (inst p.DstRC:$sdst, p.Src0RC32:$src0), - !if (!eq(p.NumSrcArgs, 2), - // 1 dst, 2 src - (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1), - // else - unreachable - (inst)))), - // else - !if (!eq(p.NumSrcArgs, 2), - // 0 dst, 2 src - (inst p.Src0RC32:$src0, p.Src1RC32:$src1), - !if (!eq(p.NumSrcArgs, 1), - // 0 dst, 1 src - (inst p.Src0RC32:$src1), - // else - // 0 dst, 0 src - (inst)))); -} - -class SIInstAliasSI <string asm, string op_name, VOPProfile p> : - SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { - let AssemblerPredicate = SIAssemblerPredicate; -} - -class SIInstAliasVI <string asm, string op_name, VOPProfile p> : - SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { - let AssemblerPredicates = [isVI]; -} - -multiclass SIInstAliasBuilder <string asm, VOPProfile p> { - - def : SIInstAliasSI <asm, NAME, p>; - - def : SIInstAliasVI <asm, NAME, p>; -} - -class VOP <string opName> { - string OpName = opName; -} - -class VOP2_REV <string revOp, bit isOrig> { +class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; } @@ -1684,832 +1151,6 @@ class AtomicNoRet <string noRetOp, bit isRet> { bit IsRet = isRet; } -class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : - VOP1Common <outs, ins, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, - MnemonicAlias<opName#"_e32", opName> { - let isPseudo = 1; - let isCodeGenOnly = 1; - - field bits<8> vdst; - field bits<9> src0; -} - -class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : - VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { - let AssemblerPredicate = SIAssemblerPredicate; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : - VOP1<op.VI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, - string asm = opName#p.Asm32> { - def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - - def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; - - def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - -} - -class VOP1_DPP <vop1 op, string opName, VOPProfile p> : - VOP1_DPPe <op.VI>, - VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> { - let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); - let DecoderNamespace = "DPP"; - let DisableDecoder = DisableVIDecoder; - let src0_modifiers = !if(p.HasModifiers, ?, 0); - let src1_modifiers = 0; -} - -class SDWADisableFields <VOPProfile p> { - bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?); - bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?); - bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0), - 0, - !if(p.HasModifiers, ?, 0)); - bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0), - 0, - !if(p.HasModifiers, 0, ?)); - bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6, - !if(!eq(p.NumSrcArgs, 1), 6, - ?)); - bits<2> src1_fmodifiers = 
!if(!eq(p.NumSrcArgs, 0), 0, - !if(!eq(p.NumSrcArgs, 1), 0, - !if(p.HasModifiers, ?, 0))); - bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0, - !if(!eq(p.NumSrcArgs, 1), 0, - !if(p.HasModifiers, 0, ?))); - bits<3> dst_sel = !if(p.HasDst, ?, 6); - bits<2> dst_unused = !if(p.HasDst, ?, 2); - bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?); -} - -class VOP1_SDWA <vop1 op, string opName, VOPProfile p> : - VOP1_SDWAe <op.VI>, - VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, - SDWADisableFields <p> { - let AsmMatchConverter = "cvtSdwaVOP1"; - let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); - let DecoderNamespace = "SDWA"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, - string asm = opName#p.Asm32> { - - def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - - def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; -} - -class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : - VOP2Common <outs, ins, "", pattern>, - VOP <opName>, - SIMCInstr<opName#"_e32", SIEncodingFamily.NONE>, - MnemonicAlias<opName#"_e32", opName> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : - VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : - VOP2 <op.VI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, - string revOp> { - - def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, - VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; -} - -multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, - string revOp> { - - def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, - VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - - def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; - -} - -class VOP2_DPP <vop2 op, string opName, VOPProfile p> : - VOP2_DPPe <op.VI>, - VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> { - let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); - let DecoderNamespace = "DPP"; - let DisableDecoder = DisableVIDecoder; - let src0_modifiers = !if(p.HasModifiers, ?, 0); - let src1_modifiers = !if(p.HasModifiers, ?, 0); -} - -class VOP2_SDWA <vop2 op, string opName, VOPProfile p> : - VOP2_SDWAe <op.VI>, - VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, - SDWADisableFields <p> { - let AsmMatchConverter = "cvtSdwaVOP2"; - let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); - let DecoderNamespace = "SDWA"; - let DisableDecoder = DisableVIDecoder; -} - -class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { - - bits<2> src0_modifiers = !if(HasModifiers, ?, 0); - bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); - bits<2> omod = !if(HasModifiers, ?, 0); - bits<1> clamp = !if(HasModifiers, ?, 0); - bits<9> 
src1 = !if(HasSrc1, ?, 0); - bits<9> src2 = !if(HasSrc2, ?, 0); -} - -class VOP3DisableModFields <bit HasSrc0Mods, - bit HasSrc1Mods = 0, - bit HasSrc2Mods = 0, - bit HasOutputMods = 0> { - bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); - bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); - bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); - bits<2> omod = !if(HasOutputMods, ?, 0); - bits<1> clamp = !if(HasOutputMods, ?, 0); -} - -class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, "", pattern, HasMods, VOP3Only>, - VOP <opName>, - SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>, - MnemonicAlias<opName#"_e64", opName> { - let isPseudo = 1; - let isCodeGenOnly = 1; - - field bit vdst; - field bit src0; -} - -class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3e <op>, - SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3e_vi <op>, - SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -class VOP3_C_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3ce <op>, - SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP3_C_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3ce_vi <op>, - SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3be <op>, - SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3be_vi <op>, - SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -class VOP3e_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3e <op>, - SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class VOP3e_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, - bit HasMods = 0, bit VOP3Only = 0> : - VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, - VOP3e_vi <op>, - SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { - let AssemblerPredicates = 
[isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), - !if(!eq(NumSrcArgs, 2), 0, 1), - HasMods>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), - !if(!eq(NumSrcArgs, 2), 0, 1), - HasMods>; -} - -multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>; - - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<0, 0, HasMods>; - - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<0, 0, HasMods>; -} - -multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>; - - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<0, 0, HasMods>; - // No VI instruction. This class is for SI only. -} - -multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<1, 0, HasMods>; -} - -multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<1, 0, HasMods>; - - // No VI instruction. This class is for SI only. -} - -// Two operand VOP3b instruction that may have a 3rd SGPR bool operand -// instead of an implicit VCC as in the VOP2b format. -multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>; - - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<1, useSrc2Input, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<1, useSrc2Input, HasMods>; -} - -// Same as VOP3b_2_3_m but no 2nd destination (sdst), e.g. v_cndmask_b32. 
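For orientation, a sketch of how the removed VOP3 helpers above were meant to be used (the opcode numbers, operand list, and instruction name below are placeholders, not taken from this patch): one defm yields a codegen-only pseudo keyed to SIEncodingFamily.NONE plus one real encoding per subtarget, all tied together by SIMCInstr so MC lowering can pick the right opcode later.

defm V_EXAMPLE_F32 : VOP3_1_m <
  vop3<0x123, 0x234>,                    // placeholder SI/VI VOP3 opcode values
  (outs VGPR_32:$vdst), (ins VSrc_32:$src0),
  " $vdst, $src0", [],                   // asm string and (empty) patterns
  "v_example_f32", 0                     // opName, HasMods = 0 for simplicity
>;

This would expand to V_EXAMPLE_F32 (the pseudo), V_EXAMPLE_F32_si and V_EXAMPLE_F32_vi, which the getMCOpcodeGen table resolves per encoding family.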
-multiclass VOP3e_2_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>; - - def _si : VOP3e_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<1, useSrc2Input, HasMods>; - - def _vi : VOP3e_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, - VOP3DisableFields<1, useSrc2Input, HasMods>; -} - -multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, - bit HasMods, bit defExec, - string revOp, list<SchedReadWrite> sched> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { - let Defs = !if(defExec, [EXEC], []); - let SchedRW = sched; - } - - def _si : VOP3_C_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - let SchedRW = sched; - } - - def _vi : VOP3_C_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - let SchedRW = sched; - } -} - -// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. -multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, - string asm, list<dag> pattern = []> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : VOPAnyCommon <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE>; - } - - def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, - SIMCInstr <opName, SIEncodingFamily.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; - } - - def _vi : VOP3Common <outs, ins, asm, []>, - VOP3e_vi <op.VI3>, - VOP3DisableFields <1, 0, 0>, - SIMCInstr <opName, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; - } -} - -multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, - list<dag> pat64> { - - defm _e32 : VOP1_m <op, opName, p, pat32>; - - defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, - p.HasModifiers>; - - def _dpp : VOP1_DPP <op, opName, p>; - - def _sdwa : VOP1_SDWA <op, opName, p>; -} - -multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P, [], - !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]) ->; - -multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag> { - - defm _e32 : VOP1SI_m <op, opName, P, []>; - - defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]), - opName, P.HasModifiers>; -} - -multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, - list<dag> pat64, string revOp> { - - defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; - - defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, - revOp, p.HasModifiers>; - - def _dpp : VOP2_DPP <op, opName, p>; - - def _sdwa : VOP2_SDWA <op, opName, p>; -} - -multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, - SDPatternOperator node = 
null_frag, - string revOp = opName> : VOP2_Helper < - op, opName, P, [], - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp ->; - -multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName> { - - defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; - - defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - opName, revOp, P.HasModifiers>; -} - -multiclass VOP2e_Helper <vop2 op, string opName, VOPProfile p, - list<dag> pat32, list<dag> pat64, - string revOp, bit useSGPRInput> { - - let SchedRW = [Write32Bit] in { - let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { - defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; - } - - defm _e64 : VOP3e_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, - opName, revOp, p.HasModifiers, useSGPRInput>; - } -} - -multiclass VOP2eInst <vop2 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName> : VOP2e_Helper < - op, opName, P, [], - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, !eq(P.NumSrcArgs, 3) ->; - -multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, - list<dag> pat32, list<dag> pat64, - string revOp, bit useSGPRInput> { - - let SchedRW = [Write32Bit, WriteSALU] in { - let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { - defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; - } - - defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, - opName, revOp, p.HasModifiers, useSGPRInput>; - } -} - -multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName> : VOP2b_Helper < - op, opName, P, [], - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, !eq(P.NumSrcArgs, 3) ->; - -// A VOP2 instruction that is VOP3-only on VI. 
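As a concrete but hypothetical use of VOP2bInst above, a carry-writing add would be declared roughly as below; the VOP2b profile name and the opcode values are assumptions for illustration. The _e32 form implicitly defines VCC through the Defs list, while the _e64 form produced by VOP3b_2_3_m exposes the carry as an explicit SGPR-pair destination.

defm V_EXAMPLE_ADD_U32 : VOP2bInst <
  vop2<0x25, 0x19>,             // placeholder SI/VI VOP2 opcode values
  "v_example_add_u32",
  VOP2b_I32_I1_I32_I32,         // assumed profile with an i1 carry-out
  add                           // selection pattern node
>;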
-multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, - list<dag> pat32, list<dag> pat64, string revOp> { - - defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; - - defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, - revOp, p.HasModifiers>; -} - -multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName> - : VOP2_VI3_Helper < - op, opName, P, [], - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp ->; - -multiclass VOP2MADK <vop2 op, string opName, VOPProfile P, list<dag> pattern = []> { - - def "" : VOP2_Pseudo <P.Outs, P.Ins32, pattern, opName>; - -let isCodeGenOnly = 0 in { - def _si : VOP2Common <P.Outs, P.Ins32, - !strconcat(opName, P.Asm32), []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.SI>, - VOP2_MADKe <op.SI> { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; - } - - def _vi : VOP2Common <P.Outs, P.Ins32, - !strconcat(opName, P.Asm32), []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.VI>, - VOP2_MADKe <op.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; - } -} // End isCodeGenOnly = 0 -} - -class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : - VOPCCommon <ins, "", pattern>, - VOP <opName>, - SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOPC_SDWA <vopc op, string opName, bit DefExec, VOPProfile p> : - VOPC_SDWAe <op.VI>, - VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, - SDWADisableFields <p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; - let AsmMatchConverter = "cvtSdwaVOPC"; - let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); - let DecoderNamespace = "SDWA"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, - string opName, bit DefExec, VOPProfile p, - list<SchedReadWrite> sched, - string revOpName = "", string asm = opName#"_e32 "#op_asm, - string alias_asm = opName#" "#op_asm> { - def "" : VOPC_Pseudo <ins, pattern, opName>, - VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = sched; - let isConvergent = DefExec; - } - - let AssemblerPredicates = [isSICI] in { - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let isConvergent = DefExec; - let SchedRW = sched; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; - } - - } // End AssemblerPredicates = [isSICI] - - let AssemblerPredicates = [isVI] in { - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let isConvergent = DefExec; - let SchedRW = sched; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; - } - - } // End AssemblerPredicates = [isVI] - - defm : SIInstAliasBuilder<alias_asm, p>; -} - -multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, - list<dag> pat64, bit DefExec, string revOp, - VOPProfile p, list<SchedReadWrite> sched> { - defm _e32 : VOPC_m <op, p.Ins32, 
p.Asm32, pat32, opName, DefExec, p, sched, - revOp>; - - defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64, - opName, p.HasModifiers, DefExec, revOp, sched>; - - def _sdwa : VOPC_SDWA <op, opName, DefExec, p>; -} - -// Special case for class instructions which only have modifiers on -// the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, - list<dag> pat64, bit DefExec, string revOp, - VOPProfile p, list<SchedReadWrite> sched> { - defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - - defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64, - opName, p.HasModifiers, DefExec, revOp, sched>, - VOP3DisableModFields<1, 0, 0>; - - def _sdwa : VOPC_SDWA <op, opName, DefExec, p> { - let src1_fmodifiers = 0; - let src1_imodifiers = ?; - } -} - -multiclass VOPCInst <vopc op, string opName, - VOPProfile P, PatLeaf cond = COND_NULL, - string revOp = opName, - bit DefExec = 0, - list<SchedReadWrite> sched = [Write32Bit]> : - VOPC_Helper < - op, opName, [], - !if(P.HasModifiers, - [(set i1:$sdst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - DefExec, revOp, P, sched ->; - -multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0, - list<SchedReadWrite> sched> : VOPC_Class_Helper < - op, opName, [], - !if(P.HasModifiers, - [(set i1:$sdst, - (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - DefExec, opName, P, sched ->; - - -multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; - -multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; - -multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; - -multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; - - -multiclass VOPCX <vopc op, string opName, VOPProfile P, - PatLeaf cond = COND_NULL, - list<SchedReadWrite> sched, - string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1, sched>; - -multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; - -multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; - -multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; - -multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; - - -multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; - -multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; - -multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, 
VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; - -multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; - - -multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, - list<dag> pat, int NumSrcArgs, bit HasMods, - bit VOP3Only = 0> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only ->; - -multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag, bit VOP3Only = 0> : - VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64, - !if(!eq(P.NumSrcArgs, 3), - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, - P.Src2VT:$src2))]), - !if(!eq(P.NumSrcArgs, 2), - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) - /* P.NumSrcArgs == 1 */, - !if(P.HasModifiers, - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers, VOP3Only ->; - -// Special case for v_div_fmas_{f32|f64}, since it seems to be the -// only VOP instruction that implicitly reads VCC. -multiclass VOP3_VCC_Inst <vop3 op, string opName, - VOPProfile P, - SDPatternOperator node = null_frag> : VOP3_Helper < - op, opName, - (outs P.DstRC.RegClass:$vdst), - (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0, - FPInputMods:$src1_modifiers, P.Src1RC64:$src1, - FPInputMods:$src2_modifiers, P.Src2RC64:$src2, - clampmod:$clamp, - omod:$omod), - "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), - (i1 VCC)))], - 3, 1 ->; - -multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> : - VOP3b_2_3_m < - op, P.Outs64, P.Ins64, - opName#" "#P.Asm64, pattern, - opName, "", 1, 1, VOP3Only ->; - -class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), - (Inst i32:$src0_modifiers, P.Src0VT:$src0, - i32:$src1_modifiers, P.Src1VT:$src1, - i32:$src2_modifiers, P.Src2VT:$src2, - i1:$clamp, - i32:$omod)>; - //===----------------------------------------------------------------------===// // Interpolation opcodes //===----------------------------------------------------------------------===// @@ -2551,1052 +1192,6 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, } //===----------------------------------------------------------------------===// -// Vector I/O classes -//===----------------------------------------------------------------------===// - -class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - 
DS <outs, ins, "", pattern>, - SIMCInstr <opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS <outs, ins, asm, []>, - DSe <op>, - SIMCInstr <opName, SIEncodingFamily.SI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; - let DecoderNamespace="SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS <outs, ins, asm, []>, - DSe_vi <op>, - SIMCInstr <opName, SIEncodingFamily.VI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; -} - -class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS_Real_si <op,opName, outs, ins, asm> { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; -} - -class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : - DS_Real_vi <op, opName, outs, ins, asm> { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; -} - -multiclass DS_1A_RET_ <dsop op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si <op.SI, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op.VI, opName, outs, ins, asm>; - } -} - -// TODO: DS_1A_RET can be inherited from DS_1A_RET_ but its not working -// for some reason. 
In fact we can remove this class if use dsop everywhere -multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, - gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<opName, 0>; - - let data1 = 0, vdst = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A_Off8_NORET <bits<8> op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, - offset0:$offset0, offset1:$offset1, gds:$gds), - string asm = opName#" $addr $offset0"#"$offset1$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A2D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - offset0:$offset0, offset1:$offset1, gds:$gds), - string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, - string noRetOp = "", - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - - let hasPostISelHook = 1 in { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; - - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } - } -} - -multiclass DS_1A1D_PERMUTE <bits<8> op, string opName, RegisterClass rc, - SDPatternOperator node = null_frag, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset), - string asm = opName#" $vdst, $addr, $data0"#"$offset"> { - - let mayLoad = 0, mayStore = 0, isConvergent = 1 in { - def "" : DS_Pseudo <opName, outs, ins, - [(set i32:$vdst, - (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))]>; - - let data1 = 0, gds = 0 in { - def "_vi" : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } - } -} - -multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, - string noRetOp = "", dag ins, - dag outs = 
(outs rc:$vdst), - string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - - let hasPostISelHook = 1 in { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; - - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, - string noRetOp = "", RegisterClass src = rc> : - DS_1A2D_RET_m <op, asm, rc, noRetOp, - (ins VGPR_32:$addr, src:$data0, src:$data1, - offset:$offset, gds:$gds) ->; - -multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc, - string noRetOp = opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 0>; - - let vdst = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass DS_0A_RET <bits<8> op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins offset:$offset, gds:$gds), - string asm = opName#" $vdst"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, []>; - - let addr = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } // end addr = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -multiclass DS_1A_RET_GDS <bits<8> op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, offset:$offset), - string asm = opName#" $vdst, $addr"#"$offset gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let data0 = 0, data1 = 0, gds = 1 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } // end data0 = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A_GDS <bits<8> op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr), - string asm = opName#" $addr gds"> { - - def "" : DS_Pseudo <opName, outs, ins, []>; - - let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { - def _si : DS_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Real_vi <op, opName, outs, ins, asm>; - } // end vdst = 0, data = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A <bits<8> op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), - string asm = opName#" $addr"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo <opName, outs, ins, []>; - - let vdst = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; - } // let vdst = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -//===----------------------------------------------------------------------===// -// MTBUF classes -//===----------------------------------------------------------------------===// - -class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - MTBUF <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, - string asm> : - MTBUF <outs, ins, asm, []>, - MTBUFe <op>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let DecoderNamespace="SICI"; - let 
DisableDecoder = DisableSIDecoder; -} - -class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : - MTBUF <outs, ins, asm, []>, - MTBUFe_vi <op>, - SIMCInstr <opName, SIEncodingFamily.VI> { - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> { - - def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; - - def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; - - def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; - -} - -let mayStore = 1, mayLoad = 0 in { - -multiclass MTBUF_Store_Helper <bits<3> op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayStore = 1, mayLoad = 0 - -let mayLoad = 1, mayStore = 0 in { - -multiclass MTBUF_Load_Helper <bits<3> op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayLoad = 1, mayStore = 0 - -//===----------------------------------------------------------------------===// -// MUBUF classes -//===----------------------------------------------------------------------===// - -class mubuf <bits<7> si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -let isCodeGenOnly = 0 in { - -class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { - let lds = 0; -} - -} // End let isCodeGenOnly = 0 - -class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { - let lds = 0; -} - -class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { - bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; -} - -class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - MUBUF <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - - // dummy fields, so that we can use let statements around multiclasses - bits<1> offen; - bits<1> idxen; - bits<8> vaddr; - bits<1> glc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; -} - -class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, - string asm> : - MUBUF <outs, ins, asm, []>, - MUBUFe <op.SI>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let lds = 0; - let AssemblerPredicate = SIAssemblerPredicate; - let DecoderNamespace="SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, - string asm> : - MUBUF <outs, ins, asm, []>, - MUBUFe_vi <op.VI>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let lds = 0; - let AssemblerPredicate = VIAssemblerPredicate; - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> { - - def "" : MUBUF_Pseudo 
<opName, outs, ins, pattern>, - MUBUFAddr64Table <0>; - - let DisableWQM = 1 in { - def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>; - } - - let addr64 = 0, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; - } - - def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; -} - -multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, - dag ins, string asm, list<dag> pattern> { - - def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, - MUBUFAddr64Table <1>; - - let addr64 = 1, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. -} - -multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, - string asm, list<dag> pattern, bit is_return> { - - def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, - MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, - AtomicNoRet<NAME#"_OFFSET", is_return>; - - let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; - } - - def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, - string asm, list<dag> pattern, bit is_return> { - - def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, - MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, - AtomicNoRet<NAME#"_ADDR64", is_return>; - - let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { - def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. -} - -multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins, - string asm, list<dag> pattern, bit is_return> { - - def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, - AtomicNoRet<opName, is_return>; - - let tfe = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; - } - - def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; - } -} - -multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, - ValueType vt, SDPatternOperator atomic> { - - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, - DisableWQM = 1 in { - - // No return variants - let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { - - defm _ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_addr64", (outs), - (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$slc", [], 0 - >; - - defm _OFFSET : MUBUFAtomicOffset_m < - op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset, - slc:$slc), - name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0 - >; - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUFAtomicOther_m < - op, name#"_offen", (outs), - (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0 - >; - } - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUFAtomicOther_m < - op, name#"_idxen", (outs), - (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0 - >; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUFAtomicOther_m < - op, name#"_bothen", (outs), - (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - 
offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc", - [], 0 - >; - } - } // glc = 0 - - // Variant that return values - let glc = 1, Constraints = "$vdata = $vdata_in", - AsmMatchConverter = "cvtMubufAtomicReturn", - DisableEncoding = "$vdata_in" in { - - defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc", - [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$vdata_in))], 1 - >; - - defm _RTN_OFFSET : MUBUFAtomicOffset_m < - op, name#"_rtn_offset", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, off, $srsrc, $soffset$offset glc$slc", - [(set vt:$vdata, - (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))], 1 - >; - - let offen = 1, idxen = 0 in { - defm _RTN_OFFEN : MUBUFAtomicOther_m < - op, name#"_rtn_offen", (outs rc:$vdata), - (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc", - [], 1 - >; - } - - let offen = 0, idxen = 1 in { - defm _RTN_IDXEN : MUBUFAtomicOther_m < - op, name#"_rtn_idxen", (outs rc:$vdata), - (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc", - [], 1 - >; - } - - let offen = 1, idxen = 1 in { - defm _RTN_BOTHEN : MUBUFAtomicOther_m < - op, name#"_rtn_bothen", (outs rc:$vdata), - (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc", - [], 1 - >; - } - } // glc = 1 - - } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 -} - -// FIXME: tfe can't be an operand because it requires a separate -// opcode because it needs an N+1 register class dest register. 
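To make the MUBUF_Atomic structure above concrete, an atomic opcode would be instantiated roughly as below; the opcode values and the atomic_add_global pattern fragment are assumptions for illustration. The multiclass then fans out into the _OFFSET/_OFFEN/_IDXEN/_BOTHEN/_ADDR64 no-return variants and their glc = 1 _RTN counterparts declared above.

defm BUFFER_ATOMIC_ADD_EXAMPLE : MUBUF_Atomic <
  mubuf<0x32, 0x42>,              // placeholder SI/VI MUBUF opcode values
  "buffer_atomic_add_example",
  VGPR_32, i32,                   // data register class and value type
  atomic_add_global               // assumed global-atomic PatFrag
>;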
-multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, - ValueType load_vt = i32, - SDPatternOperator ld = null_frag> { - - let mayLoad = 1, mayStore = 0 in { - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), - (ins SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe", - [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, - i32:$soffset, i16:$offset, - i1:$glc, i1:$slc, i1:$tfe)))]>; - } - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), - (ins VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, - tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>; - } - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), - (ins VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, glc:$glc, - slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), - (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata), - (ins VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, - glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$glc$slc$tfe", - [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, - i1:$tfe)))]>; - } - } -} - -multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, - ValueType store_vt = i32, SDPatternOperator st = null_frag> { - let mayLoad = 0, mayStore = 1 in { - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe", - [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; - } // offen = 0, idxen = 0, vaddr = 0 - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), - (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, glc:$glc, - slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen"# - "$offset$glc$slc$tfe", []>; - } // end offen = 1, idxen = 0 - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs), - (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, offset:$offset, glc:$glc, - slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs), - (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), - (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, - 
SCSrc_32:$soffset, - offset:$offset, glc:$glc, slc:$slc, - tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"# - "$offset$glc$slc$tfe", - [(st store_vt:$vdata, - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, - i32:$soffset, i16:$offset, - i1:$glc, i1:$slc, i1:$tfe))]>; - } - } // End mayLoad = 0, mayStore = 1 -} - -// For cache invalidation instructions. -multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { - let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { - def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; - - // Set everything to 0. - let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, - vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; - } - - def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; - } - } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" -} - -//===----------------------------------------------------------------------===// -// FLAT classes -//===----------------------------------------------------------------------===// - -class flat <bits<7> ci, bits<7> vi = ci> { - field bits<7> CI = ci; - field bits<7> VI = vi; -} - -class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : - FLAT <0, outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : - FLAT <op, outs, ins, asm, []>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicate = isCIOnly; - let DecoderNamespace="CI"; -} - -class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : - FLAT <op, outs, ins, asm, []>, - SIMCInstr<opName, SIEncodingFamily.VI> { - let AssemblerPredicate = VIAssemblerPredicate; - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, - list<dag> pattern> { - def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, - AtomicNoRet <NAME, 1>; - - def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; - - def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; -} - -multiclass FLAT_Load_Helper <flat op, string asm_name, - RegisterClass regClass, - dag outs = (outs regClass:$vdst), - dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe), - string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> { - - let data = 0, mayLoad = 1 in { - - def "" : FLAT_Pseudo <NAME, outs, ins, []>; - - def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; - - def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; - } -} - -multiclass FLAT_Store_Helper <flat op, string asm_name, - RegisterClass vdataClass, - dag outs = (outs), - dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc, - slc:$slc, tfe:$tfe), - string asm = asm_name#" $addr, $data$glc$slc$tfe"> { - - let mayLoad = 0, mayStore = 1, vdst = 0 in { - - def "" : FLAT_Pseudo <NAME, outs, ins, []>; - - def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; - - def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; - } -} - -multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, - ValueType vt, SDPatternOperator atomic = null_frag, - ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> { - - let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { - def "" : FLAT_Pseudo <NAME, (outs), - (ins VReg_64:$addr, data_rc:$data, - 
slc:$slc, tfe:$tfe), []>, - AtomicNoRet <NAME, 0>; - - def _ci : FLAT_Real_ci <op.CI, NAME, (outs), - (ins VReg_64:$addr, data_rc:$data, - slc:$slc, tfe:$tfe), - asm_noret>; - - def _vi : FLAT_Real_vi <op.VI, NAME, (outs), - (ins VReg_64:$addr, data_rc:$data, - slc:$slc, tfe:$tfe), - asm_noret>; - } - - let glc = 1, hasPostISelHook = 1 in { - defm _RTN : FLAT_AtomicRet_m < - op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe), - asm_name#" $vdst, $addr, $data glc$slc$tfe", - [(set vt:$vdst, - (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))] - >; - } -} - -class MIMG_Mask <string op, int channels> { - string Op = op; - int Channels = channels; -} - -class mimg <bits<7> si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -class MIMG_Helper <dag outs, dag ins, string asm, - string dns=""> : MIMG<outs, ins, asm,[]> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; - let DecoderNamespace = dns; - let isAsmParserOnly = !if(!eq(dns,""), 1, 0); - let AsmMatchConverter = "cvtMIMG"; -} - -class MIMG_NoSampler_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass addr_rc, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { - let ssamp = 0; -} - -multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} - -multiclass MIMG_NoSampler <bits<7> op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; -} - -class MIMG_Store_Helper <bits<7> op, string asm, - RegisterClass data_rc, - RegisterClass addr_rc> : MIMG_Helper < - (outs), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" - >, MIMGe<op> { - let ssamp = 0; - let mayLoad = 1; // TableGen requires this for matching with the intrinsics - let mayStore = 1; - let hasSideEffects = 1; - let hasPostISelHook = 0; - let DisableWQM = 1; -} - -multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, - RegisterClass data_rc, - int channels> { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} - -multiclass MIMG_Store <bits<7> op, string asm> { - defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>; -} - -class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, - RegisterClass 
addr_rc> : MIMG_Helper < - (outs data_rc:$vdst), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" - > { - let mayStore = 1; - let hasSideEffects = 1; - let hasPostISelHook = 0; - let DisableWQM = 1; - let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; -} - -class MIMG_Atomic_Real_si<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.SI>, - MIMGe<op.SI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class MIMG_Atomic_Real_vi<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.VI>, - MIMGe<op.VI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.NONE>; - } - - let ssamp = 0 in { - def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>; - - def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>; - } -} - -multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> { - defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>; - defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>; - defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>; -} - -class MIMG_Sampler_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, - int wqm, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { - let WQM = wqm; -} - -multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} - -multiclass MIMG_Sampler <bits<7> op, string asm, int wqm=0> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>; -} - -multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; - -class MIMG_Gather_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - (outs dst_rc:$vdata), - (ins 
src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - []>, MIMGe<op> { - let mayLoad = 1; - let mayStore = 0; - - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. - // Therefore, disable all code which updates DMASK by setting this: - let Gather4 = 1; - let hasPostISelHook = 0; - let WQM = wqm; - - let isAsmParserOnly = 1; // TBD: fix it later -} - -multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} - -multiclass MIMG_Gather <bits<7> op, string asm, int wqm=0> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>; -} - -multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; - -//===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -3604,18 +1199,18 @@ multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; def getVOPe64 : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["4"]; - let ValueCols = [["8"]]; + let ColFields = ["Size", "VOP3"]; + let KeyCol = ["4", "0"]; + let ValueCols = [["8", "1"]]; } // Maps an opcode in e64 form to its e32 equivalent def getVOPe32 : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["8"]; - let ValueCols = [["4"]]; + let ColFields = ["Size", "VOP3"]; + let KeyCol = ["8", "1"]; + let ValueCols = [["4", "0"]]; } def getMaskedMIMGOp : InstrMapping { @@ -3628,7 +1223,7 @@ def getMaskedMIMGOp : InstrMapping { // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { - let FilterClass = "VOP2_REV"; + let FilterClass = "Commutable_REV"; let RowFields = ["RevOp"]; let ColFields = ["IsOrig"]; let KeyCol = ["0"]; @@ -3637,31 +1232,13 @@ def getCommuteOrig : InstrMapping { // Maps an original opcode to its commuted version def getCommuteRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -def getCommuteCmpOrig : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an original opcode to its commuted version -def getCommuteCmpRev : InstrMapping { - 
let FilterClass = "VOP2_REV"; + let FilterClass = "Commutable_REV"; let RowFields = ["RevOp"]; let ColFields = ["IsOrig"]; let KeyCol = ["1"]; let ValueCols = [["0"]]; } - def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; @@ -3671,6 +1248,15 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.VI)]]; } +// Get equivalent SOPK instruction. +def getSOPKOp : InstrMapping { + let FilterClass = "SOPKInstTable"; + let RowFields = ["BaseCmpOp"]; + let ColFields = ["IsSOPK"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + def getAddr64Inst : InstrMapping { let FilterClass = "MUBUFAddr64Table"; let RowFields = ["OpName"]; @@ -3699,4 +1285,6 @@ def getAtomicNoRetOp : InstrMapping { include "SIInstructions.td" include "CIInstructions.td" -include "VIInstructions.td" + +include "DSInstructions.td" +include "MIMGInstructions.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index dde5f2f..38e31e7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -11,13 +11,6 @@ // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// -class InterpSlots { -int P0 = 2; -int P10 = 0; -int P20 = 1; -} -def INTERP : InterpSlots; - def isGCN : Predicate<"Subtarget->getGeneration() " ">= SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; @@ -25,9 +18,18 @@ def isSI : Predicate<"Subtarget->getGeneration() " "== SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; - def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; +def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, + AssemblerPredicate<"FeatureVGPRIndexMode">; +def HasMovrel : Predicate<"Subtarget->hasMovrel()">, + AssemblerPredicate<"FeatureMovrel">; + +include "VOPInstructions.td" +include "SOPInstructions.td" +include "SMInstructions.td" +include "FLATInstructions.td" +include "BUFInstructions.td" let SubtargetPredicate = isGCN in { @@ -35,1393 +37,8 @@ let SubtargetPredicate = isGCN in { // EXP Instructions //===----------------------------------------------------------------------===// -defm EXP : EXP_m; - -//===----------------------------------------------------------------------===// -// SMRD Instructions -//===----------------------------------------------------------------------===// - -// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SReg_32_XM0 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. 
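Referring back to the getSOPKOp mapping added above: it keys rows on BaseCmpOp and columns on IsSOPK, so a compare instruction and its SOPK twin only need to share one table entry. A sketch with assumed class layout and record names:

class SOPKInstTable <bit is_sopk, string cmpOp = ""> { // assumed constructor order
  bit IsSOPK = is_sopk;        // column field queried by getSOPKOp
  string BaseCmpOp = cmpOp;    // row field shared by both forms
}
// an s_cmp_eq-style def would mix in  SOPKInstTable<0, "s_cmp_eq_example">
// its s_cmpk_eq-style twin would use  SOPKInstTable<1, "s_cmp_eq_example">
// getSOPKOp then maps the former's opcode to the latter's.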
-defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32_XM0>; -defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 ->; - -let mayStore = ? in { -// FIXME: mayStore = ? is a workaround for tablegen bug for different -// inferred mayStore flags for the instruction pattern vs. standalone -// Pat. Each considers the other contradictory. - -defm S_MEMTIME : SMRD_Special <smrd<0x1e, 0x24>, "s_memtime", - (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))] ->; -} - -defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", - int_amdgcn_s_dcache_inv>; - -//===----------------------------------------------------------------------===// -// SOP1 Instructions -//===----------------------------------------------------------------------===// - -let isMoveImm = 1 in { - let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; - defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; - } // End isRematerializeable = 1 - - let Uses = [SCC] in { - defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>; - defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>; - } // End Uses = [SCC] -} // End isMoveImm = 1 - -let Defs = [SCC] in { - defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32", - [(set i32:$sdst, (not i32:$src0))] - >; - - defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64", - [(set i64:$sdst, (not i64:$src0))] - >; - defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>; - defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>; -} // End Defs = [SCC] - - -defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$sdst, (bitreverse i32:$src0))] ->; -defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; - -let Defs = [SCC] in { - defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; - defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; - defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", - [(set i32:$sdst, (ctpop i32:$src0))] - >; - defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; -} // End Defs = [SCC] - -defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; -defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; -defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", - [(set i32:$sdst, (cttz_zero_undef i32:$src0))] ->; -defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; - -defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] ->; - -defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, 
"s_flbit_i32_b64", []>; -defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", - [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))] ->; -defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; -defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", - [(set i32:$sdst, (sext_inreg i32:$src0, i8))] ->; -defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", - [(set i32:$sdst, (sext_inreg i32:$src0, i16))] ->; - -defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64_32 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; -defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64_32 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; -defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_1 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; -defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_1 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { - -defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; -defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>; -defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>; -defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>; -defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>; - -} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] - -defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; -defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; - -let Uses = [M0] in { -defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; -defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; -defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; -defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; -} // End Uses = [M0] - -defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; -defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; -let Defs = [SCC] in { - defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; -} // End Defs = [SCC] -defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>; - -//===----------------------------------------------------------------------===// -// SOP2 Instructions -//===----------------------------------------------------------------------===// - -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; -defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", - [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; -defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", - [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", - [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", - [(set i32:$sdst, 
(sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] - -defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", - [(set i32:$sdst, (smin i32:$src0, i32:$src1))] ->; -defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] ->; -defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] ->; -defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - - -let Uses = [SCC] in { - defm S_CSELECT_B32 : SOP2_32 <sop2<0x0a>, "s_cselect_b32", []>; - defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; -} // End Uses = [SCC] - -let Defs = [SCC] in { -defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", - [(set i32:$sdst, (and i32:$src0, i32:$src1))] ->; - -defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", - [(set i64:$sdst, (and i64:$src0, i64:$src1))] ->; - -defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", - [(set i32:$sdst, (or i32:$src0, i32:$src1))] ->; - -defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", - [(set i64:$sdst, (or i64:$src0, i64:$src1))] ->; - -defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", - [(set i32:$sdst, (xor i32:$src0, i32:$src1))] ->; - -defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", - [(set i64:$sdst, (xor i64:$src0, i64:$src1))] ->; -defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; -defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; -defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>; -defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>; -defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>; -defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>; -defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>; -defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>; -defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>; -defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>; -} // End Defs = [SCC] - -// Use added complexity so these patterns are preferred to the VALU patterns. 
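The adde/sube patterns on S_ADDC_U32 and S_SUBB_U32 above are what let a 64-bit scalar add be split into a low-half add that defines SCC and a high-half add that consumes it. A rough C++ model of that split, under the assumption that SCC simply carries the unsigned carry bit between the two halves:

#include <cstdint>
#include <cstdio>

// Model of SCC-carried 64-bit addition: s_add_u32 produces the carry-out in
// SCC, s_addc_u32 adds it back in on the high half. This is the arithmetic
// the patterns describe, not the compiler's own code.
uint64_t scalarAdd64(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);

  uint32_t Lo  = ALo + BLo;           // s_add_u32  lo, alo, blo
  uint32_t SCC = Lo < ALo ? 1 : 0;    // carry-out lands in SCC
  uint32_t Hi  = AHi + BHi + SCC;     // s_addc_u32 hi, ahi, bhi  (reads SCC)

  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  std::printf("%llx\n", (unsigned long long)scalarAdd64(0xffffffffULL, 1)); // 100000000
}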
-let AddedComplexity = 1 in { -let Defs = [SCC] in { - -defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", - [(set i32:$sdst, (shl i32:$src0, i32:$src1))] ->; -defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", - [(set i64:$sdst, (shl i64:$src0, i32:$src1))] ->; -defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", - [(set i32:$sdst, (srl i32:$src0, i32:$src1))] ->; -defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", - [(set i64:$sdst, (srl i64:$src0, i32:$src1))] ->; -defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", - [(set i32:$sdst, (sra i32:$src0, i32:$src1))] ->; -defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", - [(set i64:$sdst, (sra i64:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - -defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", - [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64_32_32 <sop2<0x25, 0x23>, "s_bfm_b64", []>; -defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", - [(set i32:$sdst, (mul i32:$src0, i32:$src1))] ->; - -} // End AddedComplexity = 1 - -let Defs = [SCC] in { -defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; -defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64_32 <sop2<0x29, 0x27>, "s_bfe_u64", []>; -defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; -} // End Defs = [SCC] - -let sdst = 0 in { -defm S_CBRANCH_G_FORK : SOP2_m < - sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] ->; -} - -let Defs = [SCC] in { -defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; -} // End Defs = [SCC] - -//===----------------------------------------------------------------------===// -// SOPC Instructions -//===----------------------------------------------------------------------===// - -def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>; -def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>; -def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>; -def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>; -def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>; -def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>; -def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>; -def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE >; -def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>; -def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>; -def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>; -def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>; -def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">; -def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">; -def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">; -def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">; -def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">; - -//===----------------------------------------------------------------------===// -// SOPK Instructions -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1, isMoveImm = 1 in { -defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; -} // End isReMaterializable = 1 -let Uses = [SCC] in { - defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>; -} - -let isCompare = 1 in { - -/* -This instruction is disabled for now until we can 
figure out how to teach -the instruction selector to correctly use the S_CMP* vs V_CMP* -instructions. - -When this instruction is enabled the code generator sometimes produces this -invalid sequence: - -SCC = S_CMPK_EQ_I32 SGPR0, imm -VCC = COPY SCC -VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 - -defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", - [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] ->; -*/ - -defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", []>; -defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; -defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; -defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; -defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>; -defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>; -defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>; -defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>; -defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>; -defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>; -defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; -defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; -} // End isCompare = 1 - -let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", - Constraints = "$sdst = $src0" in { - defm S_ADDK_I32 : SOPK_32TIE <sopk<0x0f, 0x0e>, "s_addk_i32", []>; - defm S_MULK_I32 : SOPK_32TIE <sopk<0x10, 0x0f>, "s_mulk_i32", []>; -} - -defm S_CBRANCH_I_FORK : SOPK_m < - sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), - (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; - -let mayLoad = 1 in { -defm S_GETREG_B32 : SOPK_m < - sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst), - (ins hwreg:$simm16), " $sdst, $simm16" ->; -} - -defm S_SETREG_B32 : SOPK_m < - sopk<0x13, 0x12>, "s_setreg_b32", (outs), - (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst" ->; -// FIXME: Not on SI? 
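SOPK forms such as s_movk_i32, s_addk_i32 and the s_cmpk_* family carry a 16-bit signed immediate (simm16) instead of a full 32-bit literal, so moving an instruction into its K form (the direction the getSOPKOp mapping describes) is only legal when the constant fits that field. A small spelled-out version of that check; the helper name here is ours, LLVM itself would use isInt<16>():

#include <cstdint>
#include <cstdio>

// An operand can move into a SOPK encoding only if it fits the 16-bit signed
// immediate field of that encoding.
bool fitsSImm16(int64_t V) {
  return V >= -32768 && V <= 32767;
}

int main() {
  std::printf("%d\n", fitsSImm16(1234));   // 1 -> e.g. an add could use the K form
  std::printf("%d\n", fitsSImm16(70000));  // 0 -> must keep the full 32-bit literal form
}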
-//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; -defm S_SETREG_IMM32_B32 : SOPK_IMM32 < - sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), - (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm" ->; - -//===----------------------------------------------------------------------===// -// SOPP Instructions -//===----------------------------------------------------------------------===// - -def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(AMDGPUendpgm)]> { - let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; - let hasSideEffects = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; -} - -let Uses = [SCC] in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16), - "s_cbranch_scc0 $simm16" ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16), - "s_cbranch_scc1 $simm16", - [(si_uniform_br_scc SCC, bb:$simm16)] ->; -} // End Uses = [SCC] - -let Uses = [VCC] in { -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16), - "s_cbranch_vccz $simm16" ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16), - "s_cbranch_vccnz $simm16" ->; -} // End Uses = [VCC] - -let Uses = [EXEC] in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16), - "s_cbranch_execz $simm16" ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16), - "s_cbranch_execnz $simm16" ->; -} // End Uses = [EXEC] - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", - [(int_amdgcn_s_barrier)] -> { - let SchedRW = [WriteBarrier]; - let simm16 = 0; - let mayLoad = 1; - let mayStore = 1; - let isConvergent = 1; -} - -let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; -def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; - -// On SI the documentation says sleep for approximately 64 * low 2 -// bits, consistent with the reported maximum of 448. On VI the -// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the -// maximum really 15 on VI? -def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), - "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { - let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; -} - -def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; - -let Uses = [EXEC, M0] in { - // FIXME: Should this be mayLoad+mayStore? 
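The S_SLEEP comment above is doing small arithmetic: the delay is roughly 64 times the low bits of simm16, and the reported maxima (448 on SI, 960 on VI) correspond to 64 * 7 and 64 * 15, i.e. three usable bits on SI and four on VI. A sketch of that estimate; the field widths are inferred from those maxima, not quoted from the ISA manual:

#include <cstdint>
#include <cstdio>

// Rough S_SLEEP duration model: delay ~= 64 * (low bits of simm16).
// 448/64 = 7 suggests 3 usable bits on SI; 960/64 = 15 suggests 4 on VI.
// The masks below are inferred, not authoritative.
unsigned sleepCycles(uint16_t Simm16, bool IsVI) {
  unsigned Mask = IsVI ? 0xf : 0x7;
  return 64u * (Simm16 & Mask);
}

int main() {
  std::printf("%u\n", sleepCycles(0xffff, /*IsVI=*/false)); // 448
  std::printf("%u\n", sleepCycles(0xffff, /*IsVI=*/true));  // 960
}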
- def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", - [(AMDGPUsendmsg (i32 imm:$simm16))] - >; -} // End Uses = [EXEC, M0] - -def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">; -def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; -def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { - let simm16 = 0; -} -def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; -def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; -def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { - let simm16 = 0; -} -} // End hasSideEffects - -//===----------------------------------------------------------------------===// -// VOPC Instructions -//===----------------------------------------------------------------------===// - -let isCompare = 1, isCommutable = 1 in { - -defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">; -defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; -defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; -defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>; -defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>; -defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>; -defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>; -defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>; -defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; -defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>; -defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; -defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>; -defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>; -defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>; -defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">; - - -defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">; -defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">; -defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">; -defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">; -defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">; -defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">; -defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">; -defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">; -defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">; -defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">; -defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">; -defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">; -defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">; -defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">; -defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">; -defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">; - - -defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">; -defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; -defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", 
COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; -defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>; -defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>; -defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>; -defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>; -defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>; -defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; -defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>; -defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; -defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>; -defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>; -defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">; - - -defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">; -defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">; -defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">; -defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64", "v_cmpx_ge_f64">; -defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">; -defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">; -defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">; -defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">; -defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">; -defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">; -defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">; -defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; -defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">; -defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">; -defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">; -defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">; - - -let SubtargetPredicate = isSICI in { - -defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; -defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; -defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; -defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; -defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; -defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; -defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; -defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; -defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; -defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; -defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; -defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; -defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; -defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; -defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; -defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; - - -defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; -defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; -defm 
V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; -defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; -defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; -defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; -defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; -defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; -defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">; -defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; -defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; -defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; -defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; -defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; -defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; -defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; - - -defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; -defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; -defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; -defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; -defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; -defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; -defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; -defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; -defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; -defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; -defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; -defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; -defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; - - -defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; -defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; -defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; -defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; -defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; - -} // End SubtargetPredicate = isSICI - -defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">; -defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; -defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>; -defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", 
COND_SLE, "v_cmp_ge_i32">; -defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>; -defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>; -defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>; -defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">; - - -defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">; -defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">; -defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">; -defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">; -defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">; -defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">; -defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">; -defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">; - - -defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">; -defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; -defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>; -defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; -defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>; -defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>; -defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>; -defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">; - - -defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">; -defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">; -defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">; -defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">; -defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">; -defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">; -defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">; -defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">; - - -defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">; -defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; -defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>; -defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; -defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>; -defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>; -defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>; -defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">; - - -defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">; -defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">; -defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">; -defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_le_u32">; -defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">; -defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">; -defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">; -defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">; - - -defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">; -defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; -defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>; -defm 
V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; -defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>; -defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>; -defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>; -defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">; - -defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">; -defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">; -defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">; -defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">; -defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">; -defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">; -defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">; -defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">; - -} // End isCompare = 1, isCommutable = 1 - -defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">; -defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">; -defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; -defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; - -//===----------------------------------------------------------------------===// -// DS Instructions -//===----------------------------------------------------------------------===// - -defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; -defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; -defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; -defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; -defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; -defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; -defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; -defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; -defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; -defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; -defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; -defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; -let mayLoad = 0 in { -defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; -defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; -} -defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; -defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; - -defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; -defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; -defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; -defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; -defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; -let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; -} -defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; -defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; -defm 
DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; -defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; -defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; -defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; -defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; -defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; -defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; -defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; -defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; -defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < - 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < - 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; -defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; - -let Uses = [EXEC], mayLoad =0, mayStore = 0, isConvergent = 1 in { -defm DS_SWIZZLE_B32 : DS_1A_RET_ <dsop<0x35, 0x3d>, "ds_swizzle_b32", VGPR_32>; -} - -let mayStore = 0 in { -defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; -defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; -defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; -} -defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; -defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; -defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; -defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; -let mayLoad = 0 in { -defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; -} -defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; -defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; -defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; -defm 
DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; - -defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; -defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; -defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; -defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; -defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; -defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; -defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; -defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; -defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; -defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; -defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; -defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; -defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; -defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; -defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; -defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; -defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; -defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; - -let mayStore = 0 in { -defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; -defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; -} - -defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; -defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; -defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; -defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; -defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; -defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; -defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; -defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; -defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; -defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; -defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; -defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; -defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">; - -defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; -defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; - -defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; -defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; -defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; -defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; -defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; -defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; -defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; -defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; -defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; -defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; -defm DS_OR_SRC2_B64 : DS_1A <0xca, 
"ds_or_src2_b64">; -defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; -defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">; - -defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; -defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; - -//===----------------------------------------------------------------------===// -// MUBUF Instructions -//===----------------------------------------------------------------------===// - -defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < - mubuf<0x00>, "buffer_load_format_x", VGPR_32 ->; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < - mubuf<0x01>, "buffer_load_format_xy", VReg_64 ->; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < - mubuf<0x02>, "buffer_load_format_xyz", VReg_96 ->; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < - mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 ->; -defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < - mubuf<0x04>, "buffer_store_format_x", VGPR_32 ->; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < - mubuf<0x05>, "buffer_store_format_xy", VReg_64 ->; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < - mubuf<0x06>, "buffer_store_format_xyz", VReg_96 ->; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < - mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 ->; -defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 ->; -defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 ->; -defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 ->; -defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 ->; -defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load ->; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load ->; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load ->; - -defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < - mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global ->; - -defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < - mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global ->; - -defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store ->; - -defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store ->; - -defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store ->; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < - mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global ->; -defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic < - mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag ->; -defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < - mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global ->; -defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < - mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global ->; -//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < - mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global ->; -defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < - mubuf<0x36, 0x45>, 
"buffer_atomic_umin", VGPR_32, i32, atomic_umin_global ->; -defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < - mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global ->; -defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < - mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global ->; -defm BUFFER_ATOMIC_AND : MUBUF_Atomic < - mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global ->; -defm BUFFER_ATOMIC_OR : MUBUF_Atomic < - mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global ->; -defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < - mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global ->; -defm BUFFER_ATOMIC_INC : MUBUF_Atomic < - mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global ->; -defm BUFFER_ATOMIC_DEC : MUBUF_Atomic < - mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global ->; - -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI -defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic < - mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global ->; -defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic < - mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag ->; -defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic < - mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global ->; -defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic < - mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global ->; -//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic < - mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global ->; -defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic < - mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global ->; -defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic < - mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global ->; -defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic < - mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global ->; -defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic < - mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global ->; -defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic < - mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global ->; -defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic < - mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global ->; -defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic < - mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global ->; -defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic < - mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global ->; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI - -let SubtargetPredicate = isSI, DisableVIDecoder = 1 in { -defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI -} - -defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; - 
-//===----------------------------------------------------------------------===// -// MTBUF Instructions -//===----------------------------------------------------------------------===// - -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; - -//===----------------------------------------------------------------------===// -// MIMG Instructions -//===----------------------------------------------------------------------===// - -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; -defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; -defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; -defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI -defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">; -defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">; -defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">; -defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">; -defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">; -defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; -defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; -defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; -defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler 
<0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : 
MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { -defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; -} - -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; -} // End isMoveImm = 1 - -let Uses = [EXEC] in { - -// FIXME: Specify SchedRW for READFIRSTLANE_B32 - -def V_READFIRSTLANE_B32 : VOP1 < - 0x00000002, - (outs SReg_32:$vdst), - (ins VS_32:$src0), - "v_readfirstlane_b32 $vdst, $src0", - [] -> { - let isConvergent = 1; -} - -} - -let SchedRW = [WriteQuarterRate32] in { - -defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", - VOP_I32_F64, fp_to_sint ->; -defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32", - VOP_F64_I32, sint_to_fp ->; -defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32", - VOP_F32_I32, sint_to_fp ->; -defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32", - VOP_F32_I32, uint_to_fp ->; -defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", - VOP_I32_F32, fp_to_uint ->; -defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", - VOP_I32_F32, fp_to_sint ->; -defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", - VOP_I32_F32, fp_to_f16 ->; -defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", - VOP_F32_I32, f16_to_fp ->; -defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32", - VOP_I32_F32, cvt_rpi_i32_f32>; -defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32", - VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", - VOP_F32_F64, fround ->; -defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32", - VOP_F64_F32, fextend ->; -defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0", - VOP_F32_I32, AMDGPUcvt_f32_ubyte0 ->; -defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1", - VOP_F32_I32, AMDGPUcvt_f32_ubyte1 ->; -defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2", - VOP_F32_I32, AMDGPUcvt_f32_ubyte2 ->; -defm 
V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3", - VOP_F32_I32, AMDGPUcvt_f32_ubyte3 ->; -defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64", - VOP_I32_F64, fp_to_uint ->; -defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", - VOP_F64_I32, uint_to_fp ->; - -} // End SchedRW = [WriteQuarterRate32] - -defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", - VOP_F32_F32, AMDGPUfract ->; -defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32", - VOP_F32_F32, ftrunc ->; -defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32", - VOP_F32_F32, fceil ->; -defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32", - VOP_F32_F32, frint ->; -defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32", - VOP_F32_F32, ffloor ->; -defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32", - VOP_F32_F32, fexp2 ->; - -let SchedRW = [WriteQuarterRate32] in { - -defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32", - VOP_F32_F32, flog2 ->; -defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32", - VOP_F32_F32, AMDGPUrcp ->; -defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32", - VOP_F32_F32 ->; -defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", - VOP_F32_F32, AMDGPUrsq ->; - -} // End SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64", - VOP_F64_F64, AMDGPUrcp ->; -defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", - VOP_F64_F64, AMDGPUrsq ->; - -} // End SchedRW = [WriteDouble]; - -defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", - VOP_F32_F32, fsqrt ->; - -let SchedRW = [WriteDouble] in { - -defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", - VOP_F64_F64, fsqrt ->; - -} // End SchedRW = [WriteDouble] - -let SchedRW = [WriteQuarterRate32] in { - -defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", - VOP_F32_F32, AMDGPUsin ->; -defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", - VOP_F32_F32, AMDGPUcos ->; - -} // End SchedRW = [WriteQuarterRate32] - -defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; -defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; -defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", - VOP_I32_F64, int_amdgcn_frexp_exp ->; - -let SchedRW = [WriteDoubleAdd] in { -defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", - VOP_F64_F64, int_amdgcn_frexp_mant ->; - -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", - VOP_F64_F64, AMDGPUfract ->; -} // End SchedRW = [WriteDoubleAdd] - - -defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", - VOP_I32_F32, int_amdgcn_frexp_exp ->; -defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", - VOP_F32_F32, int_amdgcn_frexp_mant ->; -let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { -defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NO_EXT<VOP_NONE>>; -} - -let Uses = [M0, EXEC] in { -defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_NO_EXT<VOP_I32_I32>>; -defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_NO_EXT<VOP_I32_I32>>; -defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; -} // End Uses = [M0, EXEC] - 
-// These instruction only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let SchedRW = [WriteQuarterRate32] in { - -defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; -defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", - VOP_F32_F32, int_amdgcn_log_clamp>; -defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; -defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", - VOP_F32_F32, AMDGPUrsq_clamp ->; -defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", - VOP_F32_F32, AMDGPUrsq_legacy ->; - -} // End SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamp ->; - -} // End SchedRW = [WriteDouble] - -} // End SubtargetPredicate = isSICI +defm EXP : EXP_m<0, AMDGPUexport>; +defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; //===----------------------------------------------------------------------===// // VINTRP Instructions @@ -1433,11 +50,11 @@ let Uses = [M0, EXEC] in { multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, - (outs VGPR_32:$dst), - (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), - (i32 imm:$attr)))] + (outs VGPR_32:$vdst), + (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), + "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan", + [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), + (i32 imm:$attr)))] >; let OtherPredicates = [has32BankLDS] in { @@ -1446,459 +63,33 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; } // End OtherPredicates = [has32BankLDS] -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in { +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 -let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { +let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, - (outs VGPR_32:$dst), - (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; + (outs VGPR_32:$vdst), + (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), + "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan", + [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), + (i32 imm:$attr)))]>; -} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" +} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, - (outs VGPR_32:$dst), - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), - "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; - -} // End Uses = [M0, EXEC] - -//===----------------------------------------------------------------------===// 
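Note: the VINTRP changes above are operand renames only (the result becomes $vdst and the attribute/channel become typed Attr/AttrChan operands); the underlying math is the usual two-stage barycentric evaluation. The host-side C++ sketch below shows what v_interp_p1_f32 and v_interp_p2_f32 compute, assuming the P0/P10/P20 plane-constant model from the GCN ISA documents; all struct and function names here are illustrative, not LLVM APIs.

#include <cstdio>

// Host-side model of the two-stage attribute interpolation performed by
// v_interp_p1_f32 / v_interp_p2_f32 (names are illustrative).
struct AttrChan { float p0, p10, p20; };   // per-primitive plane constants

// v_interp_p1_f32: partial = p10 * i + p0
float interp_p1(const AttrChan &a, float i) { return a.p10 * i + a.p0; }

// v_interp_p2_f32: result = p20 * j + partial (src0 is tied to the result,
// matching the "$src0 = $vdst" constraint above)
float interp_p2(const AttrChan &a, float j, float partial) {
  return a.p20 * j + partial;
}

int main() {
  AttrChan a{1.0f, 0.25f, -0.5f};
  std::printf("%f\n", interp_p2(a, 0.6f, interp_p1(a, 0.3f)));
}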
-// VOP2 Instructions -//===----------------------------------------------------------------------===// - -defm V_CNDMASK_B32 : VOP2eInst <vop2<0x0, 0x0>, "v_cndmask_b32", - VOP2e_I32_I32_I32_I1 ->; - -let isCommutable = 1 in { -defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", - VOP_F32_F32_F32, fadd ->; - -defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>; -defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", - VOP_F32_F32_F32, null_frag, "v_sub_f32" ->; -} // End isCommutable = 1 - -let isCommutable = 1 in { - -defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", - VOP_F32_F32_F32 ->; - -defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", - VOP_F32_F32_F32, fmul ->; - -defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", - VOP_I32_I32_I32, AMDGPUmul_i24 ->; - -defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24", - VOP_I32_I32_I32 ->; - -defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", - VOP_I32_I32_I32, AMDGPUmul_u24 ->; - -defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24", - VOP_I32_I32_I32 ->; - -defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, - fminnum>; -defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, - fmaxnum>; -defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>; -defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>; -defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>; -defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>; - -defm V_LSHRREV_B32 : VOP2Inst < - vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshr_b32" ->; - -defm V_ASHRREV_I32 : VOP2Inst < - vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, - "v_ashr_i32" ->; - -defm V_LSHLREV_B32 : VOP2Inst < - vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshl_b32" ->; - -defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; -defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; -defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; - -let Constraints = "$vdst = $src2", DisableEncoding="$src2", - isConvertibleToThreeAddress = 1 in { -defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>; -} -} // End isCommutable = 1 - -defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32", VOP_MADMK>; - -let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32", VOP_MADAK>; -} // End isCommutable = 1 - -let isCommutable = 1 in { -// No patterns so that the scalar instructions are always selected. -// The scalar versions will be replaced with vector when needed later. - -// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, -// but the VI instructions behave the same as the SI versions. 
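Note: several of the VOP2 definitions deleted above have names that only make sense once their semantics are spelled out: the *rev shifts take the shift amount in src0 (the operands are reversed so a constant or SGPR can sit there), and the _i24/_u24 multiplies operate on the sign- or zero-extended low 24 bits. A hedged C++ sketch, based on my reading of the ISA rather than anything in this patch:

#include <cstdint>
#include <cstdio>

// v_lshrrev_b32: "rev" means the operands are swapped relative to v_lshr_b32,
// so the shift amount is src0 and the value being shifted is src1.
uint32_t lshrrev_b32(uint32_t src0, uint32_t src1) { return src1 >> (src0 & 31); }

// v_mul_i32_i24 / v_mul_u32_u24: multiply the sign- or zero-extended low
// 24 bits of each operand and keep the low 32 bits of the product.
int32_t sext24(uint32_t v) {
  v &= 0xffffff;
  return (v & 0x800000) ? (int32_t)(v | 0xff000000u) : (int32_t)v;
}
uint32_t mul_i32_i24(uint32_t a, uint32_t b) {
  return (uint32_t)((int64_t)sext24(a) * (int64_t)sext24(b));
}
uint32_t mul_u32_u24(uint32_t a, uint32_t b) {
  return (uint32_t)((uint64_t)(a & 0xffffff) * (b & 0xffffff));
}

int main() { std::printf("%d\n", (int32_t)mul_i32_i24(0xffffffu, 2)); } // -2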
-defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP2b_I32_I1_I32_I32 ->; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; - -defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" ->; - -defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP2b_I32_I1_I32_I32_I1 ->; -defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP2b_I32_I1_I32_I32_I1 ->; -defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" ->; - -} // End isCommutable = 1 - -// These are special and do not read the exec mask. -let isConvergent = 1, Uses = []<Register> in { - -defm V_READLANE_B32 : VOP2SI_3VI_m < - vop3 <0x001, 0x289>, - "v_readlane_b32", - (outs SReg_32:$vdst), - (ins VS_32:$src0, SCSrc_32:$src1), - "v_readlane_b32 $vdst, $src0, $src1" ->; - -defm V_WRITELANE_B32 : VOP2SI_3VI_m < - vop3 <0x002, 0x28a>, - "v_writelane_b32", (outs VGPR_32:$vdst), - (ins SReg_32:$src0, SCSrc_32:$src1), - "v_writelane_b32 $vdst, $src0, $src1" ->; - -} // End isConvergent = 1 - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2InstSI <vop2<0x6>, "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; -} // End isCommutable = 1 - -defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmin_legacy ->; -defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmax_legacy ->; - -let isCommutable = 1 in { -defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>; -} // End isCommutable = 1 -} // End let SubtargetPredicate = SICI - -defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", - VOP_I32_I32_I32 ->; -defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32, int_amdgcn_mbcnt_lo ->; -defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32, int_amdgcn_mbcnt_hi ->; -defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", - VOP_F32_F32_I32, AMDGPUldexp ->; - -defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", - VOP_I32_F32_I32>; // TODO: set "Uses = dst" - -defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 ->; -defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32", - VOP_I32_I32_I32 ->; -defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32", - VOP_I32_I32_I32 ->; - -//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { -defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32", - VOP_F32_F32_F32_F32, fmad 
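Note: the block above keeps v_readlane/v_writelane convergent with no implicit EXEC use, and carries the v_mbcnt_lo/_hi pair behind int_amdgcn_mbcnt_lo/hi. A typical use of the pair is computing how many active lanes precede the current one; the host-side model below assumes the usual "count set bits below my lane id" semantics, so the lane-numbering details are my assumption.

#include <bit>
#include <cstdint>
#include <cstdio>

// Host-side model of v_mbcnt_lo_u32_b32 / v_mbcnt_hi_u32_b32 for one lane.
// mbcnt_lo adds the set bits of mask_lo whose bit index is below the lane id
// (all 32 of them for lanes >= 32); mbcnt_hi does the same for bits 32..63.
uint32_t mbcnt_lo(uint32_t mask_lo, uint32_t accum, unsigned lane) {
  uint32_t below = lane >= 32 ? mask_lo : mask_lo & ((1u << lane) - 1);
  return accum + std::popcount(below);
}
uint32_t mbcnt_hi(uint32_t mask_hi, uint32_t accum, unsigned lane) {
  uint32_t below = lane < 32 ? 0 : mask_hi & ((1u << (lane - 32)) - 1);
  return accum + std::popcount(below);
}

int main() {
  // mbcnt_hi(exec_hi, mbcnt_lo(exec_lo, 0)) = active lanes before lane 40
  std::printf("%u\n", mbcnt_hi(0xff, mbcnt_lo(0xffffffff, 0, 40), 40)); // 40
}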
->; - -defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24", - VOP_I32_I32_I32_I32, AMDGPUmad_i24 ->; -defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", - VOP_I32_I32_I32_I32, AMDGPUmad_u24 ->; -} // End isCommutable = 1 - -defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", - VOP_F32_F32_F32_F32, int_amdgcn_cubeid ->; -defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", - VOP_F32_F32_F32_F32, int_amdgcn_cubesc ->; -defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", - VOP_F32_F32_F32_F32, int_amdgcn_cubetc ->; -defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", - VOP_F32_F32_F32_F32, int_amdgcn_cubema ->; - -defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", - VOP_I32_I32_I32_I32, AMDGPUbfe_u32 ->; -defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", - VOP_I32_I32_I32_I32, AMDGPUbfe_i32 ->; - -defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", - VOP_I32_I32_I32_I32, AMDGPUbfi ->; - -let isCommutable = 1 in { -defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", - VOP_F32_F32_F32_F32, fma ->; -defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64", - VOP_F64_F64_F64_F64, fma ->; - -defm V_LERP_U8 : VOP3Inst <vop3<0x14d, 0x1cd>, "v_lerp_u8", - VOP_I32_I32_I32_I32, int_amdgcn_lerp ->; -} // End isCommutable = 1 - -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; -defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32", - VOP_I32_I32_I32_I32 ->; -defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", - VOP_I32_I32_I32_I32 ->; - -defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmin3>; - -defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmin3 ->; -defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32", - VOP_I32_I32_I32_I32, AMDGPUumin3 ->; -defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmax3 ->; -defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmax3 ->; -defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", - VOP_I32_I32_I32_I32, AMDGPUumax3 ->; -defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmed3 ->; -defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmed3 ->; -defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", - VOP_I32_I32_I32_I32, AMDGPUumed3 ->; - -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; -defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", - VOP_I32_I32_I32_I32 ->; -//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; -defm V_DIV_FIXUP_F32 : VOP3Inst < - vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup ->; - -let SchedRW = [WriteDoubleAdd] in { - -defm V_DIV_FIXUP_F64 : VOP3Inst < - vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup ->; - -} // End SchedRW = [WriteDouble] - -let SchedRW = [WriteDoubleAdd] in { -let isCommutable = 1 in { - -defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", - VOP_F64_F64_F64, fadd, 1 ->; -defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64", - VOP_F64_F64_F64, fmul, 1 ->; - -defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", - VOP_F64_F64_F64, fminnum, 1 ->; -defm V_MAX_F64 : VOP3Inst 
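Note: the VOP3 block removed above includes the bitfield and median-of-three operations whose behaviour is easy to forget. The sketch below states my understanding of v_bfe_u32, v_bfi_b32 and v_med3_f32 (offset/width taken from the low five bits, bitfield insert as a mask select, median as the middle value, which is what clamp-style expressions get matched to); it is illustrative only.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative semantics for a few of the VOP3 ops deleted above.
uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
  offset &= 31; width &= 31;                    // hardware reads the low 5 bits
  return width ? (src >> offset) & ((1u << width) - 1) : 0;
}
uint32_t bfi_b32(uint32_t mask, uint32_t a, uint32_t b) {
  return (mask & a) | (~mask & b);              // bitfield insert / bit select
}
float med3_f32(float a, float b, float c) {     // middle of the three values;
  return std::max(std::min(a, b),               // min/max/med3 is how clamps
                  std::min(std::max(a, b), c)); // like min(max(x,lo),hi) match
}

int main() {
  std::printf("%u %u %f\n", bfe_u32(0xabcd, 4, 8), bfi_b32(0xf0, 0xaa, 0x55),
              med3_f32(5.f, 1.f, 3.f));         // 188 165 3.0
}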
<vop3<0x167, 0x283>, "v_max_f64", - VOP_F64_F64_F64, fmaxnum, 1 ->; - -} // End isCommutable = 1 - -defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp, 1 ->; - -} // End let SchedRW = [WriteDoubleAdd] - -let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { - -defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", - VOP_I32_I32_I32, mulhu ->; - -let DisableVIDecoder=1 in { // removed from VI as identical to V_MUL_LO_U32 -defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", - VOP_I32_I32_I32 ->; -} - -defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", - VOP_I32_I32_I32, mulhs ->; - -} // End isCommutable = 1, SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", - VOP3b_F32_I1_F32_F32_F32, [], 1 ->; -} - -let SchedRW = [WriteDouble, WriteSALU] in { -// Double precision division pre-scale. -defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", - VOP3b_F64_I1_F64_F64_F64, [], 1 ->; -} // End SchedRW = [WriteDouble] - -let isCommutable = 1, Uses = [VCC, EXEC] in { - -let SchedRW = [WriteFloatFMA] in { -// v_div_fmas_f32: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^32 -// -defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", - VOP_F32_F32_F32_F32, AMDGPUdiv_fmas ->; -} - -let SchedRW = [WriteDouble] in { -// v_div_fmas_f64: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^64 -// -defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", - VOP_F64_F64_F64_F64, AMDGPUdiv_fmas ->; - -} // End SchedRW = [WriteDouble] -} // End isCommutable = 1, Uses = [VCC, EXEC] - -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; - -let SchedRW = [WriteDouble] in { -defm V_TRIG_PREOP_F64 : VOP3Inst < - vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop ->; - -} // End SchedRW = [WriteDouble] - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { + (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), + "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan", + [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), + (i32 imm:$attr)))]>; -defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>; -defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>; -defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>; - -defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", - VOP_F32_F32_F32_F32>; - -} // End SubtargetPredicate = isSICI - -let SubtargetPredicate = isVI, DisableSIDecoder = 1 in { - -defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", - VOP_I64_I32_I64 ->; -defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64", - VOP_I64_I32_I64 ->; -defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", - VOP_I64_I32_I64 ->; - -} // End SubtargetPredicate = isVI +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // Pseudo Instructions @@ -1908,16 +99,16 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), - (ins VSrc_64:$src0, 
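Note: the comments retained in the deleted block describe v_div_fmas exactly: a fused multiply-add whose result is scaled by 2^32 (2^64 for the f64 form) when VCC is set, one step of the div_scale/div_fmas/div_fixup division expansion. Transcribed literally into C++, assuming the comment is the complete behaviour:

#include <cmath>
#include <cstdio>

// Literal transcription of the comments above: a fused multiply-add whose
// result is scaled by 2^32 (2^64 for the f64 form) when VCC is set.
float div_fmas_f32(float s0, float s1, float s2, bool vcc) {
  float r = std::fma(s0, s1, s2);
  return vcc ? std::ldexp(r, 32) : r;
}
double div_fmas_f64(double s0, double s1, double s2, bool vcc) {
  double r = std::fma(s0, s1, s2);
  return vcc ? std::ldexp(r, 64) : r;
}

int main() { std::printf("%g\n", div_fmas_f32(2.f, 3.f, 1.f, false)); } // 7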
VSrc_64:$src1, SSrc_64:$src2), "", []> { + (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { let isPseudo = 1; let isCodeGenOnly = 1; + let usesCustomInserter = 1; } // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> { - let VALU = 1; -} +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), + (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] let usesCustomInserter = 1, SALU = 1 in { @@ -1925,83 +116,142 @@ def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; } // End let usesCustomInserter = 1, SALU = 1 +def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst), + (ins SSrc_b64:$src0)> { + let SALU = 1; + let isAsCheapAsAMove = 1; + let isTerminator = 1; +} + +def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst), + (ins SSrc_b64:$src0, SSrc_b64:$src1)> { + let SALU = 1; + let isAsCheapAsAMove = 1; + let isTerminator = 1; +} + +def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst), + (ins SSrc_b64:$src0, SSrc_b64:$src1)> { + let SALU = 1; + let isAsCheapAsAMove = 1; + let isTerminator = 1; +} + +def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), + [(int_amdgcn_wave_barrier)]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; + let isBarrier = 1; + let isConvergent = 1; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. -let hasSideEffects = 1 in { - // Dummy terminator instruction to use after control flow instructions // replaced with exec mask operations. 
def SI_MASK_BRANCH : PseudoInstSI < - (outs), (ins brtarget:$target, SReg_64:$dst)> { - let isBranch = 1; + (outs), (ins brtarget:$target)> { + let isBranch = 0; let isTerminator = 1; - let isBarrier = 1; - let SALU = 1; + let isBarrier = 0; + let Uses = [EXEC]; + let SchedRW = []; + let hasNoSchedulingInfo = 1; } -let Uses = [EXEC], Defs = [EXEC, SCC] in { - -let isBranch = 1, isTerminator = 1 in { +let isTerminator = 1 in { -def SI_IF: PseudoInstSI < +def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> { + [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> { let Constraints = ""; + let Size = 12; + let mayLoad = 1; + let mayStore = 1; + let hasSideEffects = 1; } -def SI_ELSE : PseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), - [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> { +def SI_ELSE : CFPseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { let Constraints = "$src = $dst"; + let Size = 12; + let mayStore = 1; + let mayLoad = 1; + let hasSideEffects = 1; } -def SI_LOOP : PseudoInstSI < +def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - [(int_amdgcn_loop i64:$saved, bb:$target)] ->; + [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> { + let Size = 8; + let isBranch = 1; + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; +} } // End isBranch = 1, isTerminator = 1 +def SI_END_CF : CFPseudoInstSI < + (outs), (ins SReg_64:$saved), + [(int_amdgcn_end_cf i64:$saved)], 1, 1> { + let Size = 4; + let isAsCheapAsAMove = 1; + let isReMaterializable = 1; + let mayLoad = 1; + let mayStore = 1; + let hasSideEffects = 1; +} -def SI_BREAK : PseudoInstSI < +def SI_BREAK : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src), - [(set i64:$dst, (int_amdgcn_break i64:$src))] ->; + [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> { + let Size = 4; + let isAsCheapAsAMove = 1; + let isReMaterializable = 1; +} -def SI_IF_BREAK : PseudoInstSI < +def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), - [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))] ->; + [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> { + let Size = 4; + let isAsCheapAsAMove = 1; + let isReMaterializable = 1; +} -def SI_ELSE_BREAK : PseudoInstSI < +def SI_ELSE_BREAK : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))] ->; - -def SI_END_CF : PseudoInstSI < - (outs), (ins SReg_64:$saved), - [(int_amdgcn_end_cf i64:$saved)] ->; - -} // End Uses = [EXEC], Defs = [EXEC, SCC] + [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> { + let Size = 4; + let isAsCheapAsAMove = 1; + let isReMaterializable = 1; +} let Uses = [EXEC], Defs = [EXEC,VCC] in { def SI_KILL : PseudoInstSI < - (outs), (ins VSrc_32:$src), - [(int_AMDGPU_kill f32:$src)]> { + (outs), (ins VSrc_b32:$src), + [(AMDGPUkill i32:$src)]> { let isConvergent = 1; let usesCustomInserter = 1; } -def SI_KILL_TERMINATOR : PseudoInstSI < - (outs), (ins VSrc_32:$src)> { +def SI_KILL_TERMINATOR : SPseudoInstSI < + (outs), (ins VSrc_b32:$src)> { let isTerminator = 1; } } // End Uses = [EXEC], Defs = [EXEC,VCC] -} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1 +// Branch on undef scc. Used to avoid intermediate copy from +// IMPLICIT_DEF to SCC. 
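Note: SI_IF, SI_ELSE, SI_LOOP, SI_END_CF and the break pseudos above are the structurizer markers that, as the comment over SI_MASK_BRANCH says, are later "replaced with exec mask operations". The C++ model below is only a rough sketch of that bookkeeping (the real lowering is done by SILowerControlFlow with s_and_saveexec/s_xor/s_or sequences whose exact form may differ); it shows why SI_END_CF can simply OR the saved mask back into EXEC.

#include <cassert>
#include <cstdint>

// Rough model of the exec-mask bookkeeping behind the SI_* control-flow
// pseudos; real code generation differs in detail.
struct Wave { uint64_t exec; };

// SI_IF: narrow exec to the lanes taking the branch, return the rest.
uint64_t si_if(Wave &w, uint64_t cond) {
  uint64_t taken = w.exec & cond;
  uint64_t saved = w.exec & ~cond;
  w.exec = taken;
  return saved;
}
// SI_ELSE: swap to the lanes that skipped the then-block.
uint64_t si_else(Wave &w, uint64_t saved) {
  uint64_t ran_then = w.exec;
  w.exec = saved;
  return ran_then;
}
// SI_END_CF: re-enable the parked lanes.
void si_end_cf(Wave &w, uint64_t saved) { w.exec |= saved; }

int main() {
  Wave w{0xff};                      // eight active lanes
  uint64_t saved = si_if(w, 0x0f);   // exec = 0x0f
  saved = si_else(w, saved);         // exec = 0xf0
  si_end_cf(w, saved);               // exec restored
  assert(w.exec == 0xff);
}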
+def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { + let isTerminator = 1; + let usesCustomInserter = 1; +} def SI_PS_LIVE : PseudoInstSI < (outs SReg_64:$dst), (ins), @@ -2013,36 +263,37 @@ def SI_PS_LIVE : PseudoInstSI < // s_mov_b32 rather than a copy of another initialized // register. MachineCSE skips copies, and we don't want to have to // fold operands before it runs. -def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> { +def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { let Defs = [M0]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; - let SALU = 1; let isReMaterializable = 1; } -def SI_RETURN : PseudoInstSI < +def SI_RETURN : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn)]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; let hasSideEffects = 1; - let SALU = 1; let hasNoSchedulingInfo = 1; let DisableWQM = 1; } -let Uses = [EXEC], Defs = [EXEC, VCC, M0], +let Defs = [M0, EXEC], UseNamedOperandTable = 1 in { -class SI_INDIRECT_SRC<RegisterClass rc> : PseudoInstSI < - (outs VGPR_32:$vdst, SReg_64:$sdst), - (ins rc:$src, VS_32:$idx, i32imm:$offset)>; +class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < + (outs VGPR_32:$vdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset)> { + let usesCustomInserter = 1; +} -class SI_INDIRECT_DST<RegisterClass rc> : PseudoInstSI < - (outs rc:$vdst, SReg_64:$sdst), - (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { +class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < + (outs rc:$vdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { let Constraints = "$src = $vdst"; + let usesCustomInserter = 1; } // TODO: We can support indirect SGPR access. @@ -2058,53 +309,60 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; -} // End Uses = [EXEC], Defs = [EXEC,VCC,M0] +} // End Uses = [EXEC], Defs = [M0, EXEC] multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1, Uses = [EXEC] in { + let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in { def _SAVE : PseudoInstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx)> { + (ins sgpr_class:$data, i32imm:$addr)> { let mayStore = 1; let mayLoad = 0; } def _RESTORE : PseudoInstSI < - (outs sgpr_class:$dst), - (ins i32imm:$frame_idx)> { + (outs sgpr_class:$data), + (ins i32imm:$addr)> { let mayStore = 0; let mayLoad = 1; } } // End UseNamedOperandTable = 1 } -// It's unclear whether you can use M0 as the output of v_readlane_b32 -// instructions, so use SReg_32_XM0 register class for spills to prevent -// this from happening. -defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>; +// You cannot use M0 as the output of v_readlane_b32 instructions or +// use it in the sdata operand of SMEM instructions. We still need to +// be able to spill the physical register m0, so allow it for +// SI_SPILL_32_* instructions. 
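Note: the SI_SPILL_S* save/restore pseudos above carry only a data operand and an address immediate; the usual expansion, which is my assumption here rather than something visible in this hunk, parks each 32-bit SGPR in a lane of a scratch VGPR with v_writelane_b32 and reads it back with v_readlane_b32, which is why the preceding comment cares about whether m0 is a legal v_readlane_b32 destination. A host-side model:

#include <array>
#include <cstdint>

// Sketch of lane-based SGPR spilling; the real expansion lives in
// SIRegisterInfo, this only illustrates the idea behind the pseudos above.
struct ScratchVGPR { std::array<uint32_t, 64> lane{}; };

void spill_sgpr(ScratchVGPR &v, unsigned first_lane,
                const uint32_t *sgprs, unsigned num_subregs) {
  for (unsigned i = 0; i != num_subregs; ++i)
    v.lane[first_lane + i] = sgprs[i];          // v_writelane_b32
}
void restore_sgpr(const ScratchVGPR &v, unsigned first_lane,
                  uint32_t *sgprs, unsigned num_subregs) {
  for (unsigned i = 0; i != num_subregs; ++i)
    sgprs[i] = v.lane[first_lane + i];          // v_readlane_b32
}

int main() {
  ScratchVGPR v;
  uint32_t s[2] = {1, 2}, r[2] = {};
  spill_sgpr(v, 0, s, 2);
  restore_sgpr(v, 0, r, 2);                     // r == {1, 2}
}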
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { - def _SAVE : PseudoInstSI < + let UseNamedOperandTable = 1, VGPRSpill = 1, + SchedRW = [WriteVMEM] in { + def _SAVE : VPseudoInstSI < (outs), - (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset, i32imm:$offset)> { + (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, + SReg_32:$soffset, i32imm:$offset)> { let mayStore = 1; let mayLoad = 0; + // (2 * 4) + (8 * num_subregs) bytes maximum + let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); } - def _RESTORE : PseudoInstSI < - (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset, + def _RESTORE : VPseudoInstSI < + (outs vgpr_class:$vdata), + (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, i32imm:$offset)> { let mayStore = 0; let mayLoad = 1; + + // (2 * 4) + (8 * num_subregs) bytes maximum + let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); } - } // End UseNamedOperandTable = 1, VGPRSpill = 1 + } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] } defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; @@ -2114,344 +372,26 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; -let Defs = [SCC] in { - -def SI_PC_ADD_REL_OFFSET : PseudoInstSI < +def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), - (ins si_ga:$ptr), - [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> { - let SALU = 1; + (ins si_ga:$ptr_lo, si_ga:$ptr_hi), + [(set SReg_64:$dst, + (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> { + let Defs = [SCC]; } -} // End Defs = [SCC] - } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { -def : Pat < - (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) ->; - -/* int_SI_vs_load_input */ -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) ->; - -def : Pat < - (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - f32:$src0, f32:$src1, f32:$src2, f32:$src3), - (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - $src0, $src1, $src2, $src3) ->; - -//===----------------------------------------------------------------------===// -// buffer_load/store_format patterns -//===----------------------------------------------------------------------===// - -multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, - string opcode> { - def : Pat< - (vt (name v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc)), - (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; - - def : Pat< - (vt (name v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc)), - (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; - - def : Pat< - (vt (name v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc)), - (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, 
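Note: the new Size field on the VGPR spill pseudos encodes the "(2 * 4) + (8 * num_subregs) bytes maximum" comment: !srl(vgpr_class.Size, 5) is the number of 32-bit subregisters, each of which may expand to an 8-byte MUBUF instruction, plus 8 bytes of setup. A quick check of the arithmetic (register class sizes are in bits):

#include <cassert>

// Mirror of the TableGen size expression !add(!shl(!srl(Size, 5), 3), 8):
// bytes = (Size / 32) * 8 + 8, with Size the register class width in bits.
constexpr unsigned spillPseudoSize(unsigned regClassSizeInBits) {
  return ((regClassSizeInBits >> 5) << 3) + 8;
}

static_assert(spillPseudoSize(32) == 16,   "SI_SPILL_V32: 1 subreg");
static_assert(spillPseudoSize(128) == 40,  "SI_SPILL_V128: 4 subregs");
static_assert(spillPseudoSize(512) == 136, "SI_SPILL_V512: 16 subregs");

int main() {}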
(as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; - - def : Pat< - (vt (name v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc)), - (!cast<MUBUF>(opcode # _BOTHEN) - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; -} - -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; - -multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, - string opcode> { - def : Pat< - (name vt:$vdata, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; - - def : Pat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), 0) - >; - - def : Pat< - (name vt:$vdata, v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), 0) - >; - - def : Pat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _BOTHEN_exact) - $vdata, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) - >; -} - -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">; - -//===----------------------------------------------------------------------===// -// buffer_atomic patterns -//===----------------------------------------------------------------------===// -multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { - def : Pat< - (name i32:$vdata_in, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), - (!cast<MUBUF>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $slc)) - >; - - def : Pat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), - (!cast<MUBUF>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $slc)) - >; - - def : Pat< - (name i32:$vdata_in, v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset 
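Note: the MUBUF_LoadIntrinsicPat/StoreIntrinsicPat multiclasses above pick one of four encodings depending on which address components the intrinsic supplies; the dispatcher below restates that choice. The enum and function are mine, only the _OFFSET/_IDXEN/_OFFEN/_BOTHEN suffixes come from the patterns.

#include <cstdio>

// The four MUBUF addressing variants the patterns above choose between,
// keyed on whether a VGPR index and/or a VGPR offset is supplied (the
// immediate offset and SGPR soffset are present in every form).
enum class MUBUFVariant { OFFSET, IDXEN, OFFEN, BOTHEN };

MUBUFVariant selectVariant(bool hasVIndex, bool hasVOffset) {
  if (hasVIndex && hasVOffset)
    return MUBUFVariant::BOTHEN;   // vindex/voffset packed into a 64-bit pair
  if (hasVIndex)
    return MUBUFVariant::IDXEN;
  if (hasVOffset)
    return MUBUFVariant::OFFEN;
  return MUBUFVariant::OFFSET;
}

int main() { std::printf("%d\n", (int)selectVariant(true, false)); } // IDXEN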
i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), - (!cast<MUBUF>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $slc)) - >; - - def : Pat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), - (!cast<MUBUF>(opcode # _RTN_BOTHEN) - $vdata_in, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) - >; -} - -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">; - def : Pat< - (int_amdgcn_buffer_atomic_cmpswap - i32:$data, i32:$cmp, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), - (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), - sub0) + (int_amdgcn_else i64:$src, bb:$target), + (SI_ELSE $src, $target, 0) >; -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap - i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), - (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), - sub0) ->; - -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap - i32:$data, i32:$cmp, v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), - (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), - sub0) ->; - -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap - i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), - (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), - sub0) ->; - - -//===----------------------------------------------------------------------===// -// S_GETREG_B32 Intrinsic Pattern. -//===----------------------------------------------------------------------===// def : Pat < - (int_amdgcn_s_getreg imm:$simm16), - (S_GETREG_B32 (as_i16imm $simm16)) ->; - -//===----------------------------------------------------------------------===// -// DS_SWIZZLE Intrinsic Pattern. 
-//===----------------------------------------------------------------------===// -def : Pat < - (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), - (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) ->; - -//===----------------------------------------------------------------------===// -// SMRD Patterns -//===----------------------------------------------------------------------===// - -multiclass SMRD_Pattern <string Instr, ValueType vt> { - - // 1. IMM offset - def : Pat < - (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) - >; - - // 2. SGPR offset - def : Pat < - (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) - >; - - def : Pat < - (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) - > { - let Predicates = [isCIOnly]; - } -} - -// Global and constant loads can be selected to either MUBUF or SMRD -// instructions, but SMRD instructions are faster so we want the instruction -// selector to prefer those. -let AddedComplexity = 100 in { - -defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; - -// 1. Offset as an immediate -def : Pat < - (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) ->; - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) ->; - -let Predicates = [isCI] in { - -def : Pat < - (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) ->; - -} // End Predicates = [isCI] - -} // End let AddedComplexity = 10000 - -//===----------------------------------------------------------------------===// -// SOP1 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i64 (ctpop i64:$src)), - (i64 (REG_SEQUENCE SReg_64, - (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, - (S_MOV_B32 0), sub1)) ->; - -def : Pat < - (i32 (smax i32:$x, (i32 (ineg i32:$x)))), - (S_ABS_I32 $x) ->; - -//===----------------------------------------------------------------------===// -// SOP2 Patterns -//===----------------------------------------------------------------------===// - -// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. 
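Note: two of the SOP1 patterns deleted above deserve a gloss: 64-bit ctpop is lowered to s_bcnt1_i32_b64, whose 32-bit count is widened with a zeroed high half via REG_SEQUENCE, and integer absolute value is matched as smax(x, -x) onto s_abs_i32. In scalar C++ terms:

#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

// What the deleted SOP1 patterns express, in scalar C++ terms.
uint64_t ctpop_i64(uint64_t x) {
  uint32_t lo = std::popcount(x);              // s_bcnt1_i32_b64: 32-bit count
  return (uint64_t)lo;                         // high half is s_mov_b32 0
}
int32_t abs_i32(int32_t x) {
  int32_t neg = (int32_t)(0u - (uint32_t)x);   // negate without signed overflow
  return std::max(x, neg);                     // smax(x, -x) -> s_abs_i32
}

int main() {
  std::printf("%llu %d\n", (unsigned long long)ctpop_i64(~0ull), abs_i32(-7)); // 64 7
}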
-def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_U32 $src0, $src1) ->; - -//===----------------------------------------------------------------------===// -// SOPP Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (int_amdgcn_s_waitcnt i32:$simm16), - (S_WAITCNT (as_i16imm $simm16)) + (int_AMDGPU_kilp), + (SI_KILL (i32 0xbf800000)) >; //===----------------------------------------------------------------------===// @@ -2483,308 +423,79 @@ def : Pat < } // End Predicates = [UnsafeFPMath] -//===----------------------------------------------------------------------===// -// VOP2 Patterns -//===----------------------------------------------------------------------===// - def : Pat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e64 $popcnt, $val) + (f32 (fpextend f16:$src)), + (V_CVT_F32_F16_e32 $src) >; def : Pat < - (i32 (select i1:$src0, i32:$src1, i32:$src2)), - (V_CNDMASK_B32_e64 $src2, $src1, $src0) + (f64 (fpextend f16:$src)), + (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) >; -// Pattern for V_MAC_F32 def : Pat < - (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods f32:$src1, i32:$src1_modifiers), - (VOP3NoMods f32:$src2, i32:$src2_modifiers)), - (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, - $src2_modifiers, $src2, $clamp, $omod) ->; - -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; -} - -// Image only -class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass ImagePatterns<SDPatternOperator name, string opcode> { - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; -} - -class ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, - imm:$slc), - (opcode $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; - -multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { - def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V2), 
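Note: the int_AMDGPU_kilp pattern now feeds SI_KILL the 32-bit immediate 0xbf800000, which is simply the IEEE-754 encoding of -1.0f; since the kill test (as I understand it) discards lanes whose operand is negative, a constant -1.0 kills unconditionally. A two-line check of that encoding:

#include <bit>
#include <cstdint>
#include <cstdio>

// 0xbf800000 is the bit pattern of -1.0f, so the kilp lowering above is an
// unconditional kill (the operand is always negative).
int main() {
  float f = std::bit_cast<float>(uint32_t{0xbf800000});
  std::printf("%f\n", f);   // -1.000000
}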
v2i32>; - def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; -} - -class ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, - imm:$glc, imm:$slc), - (opcode $data, $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), 0, 0, (as_i1imm $da)) + (f16 (fpround f32:$src)), + (V_CVT_F16_F32_e32 $src) >; -multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { - def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; -} - -class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), - (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; - -multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; -} - -class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < - (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, - imm:$r128, imm:$da, imm:$slc), - (EXTRACT_SUBREG - (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), - $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), - sub0) ->; - -// Basic sample -defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; -defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; -defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; -defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; - -// Sample with comparison -defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; -defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; - -// Sample with offsets -defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; -defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; -defm : 
SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; -defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; - -// Sample with comparison and offsets -defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_cl_o, 
IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; - -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; - -def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; -defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; -defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; -defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">; -defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">; -defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">; -defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; - -/* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) + (f16 (fpround f64:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) >; -class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) +def : Pat < + (i32 (fp_to_sint f16:$src)), + (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) >; -class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) +def : Pat < + (i32 (fp_to_uint f16:$src)), + (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) >; -class SampleShadowPattern<SDNode name, MIMG opcode, - ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) +def : Pat < + (f16 (sint_to_fp i32:$src)), + (V_CVT_F16_F32_e32 
(V_CVT_F32_I32_e32 $src)) >; -class SampleShadowArrayPattern<SDNode name, MIMG opcode, - ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) +def : Pat < + (f16 (uint_to_fp i32:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) >; -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, - MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, -MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { - def : SamplePattern <SIsample, sample, addr_type>; - def : SampleRectPattern <SIsample, sample, addr_type>; - def : SampleArrayPattern <SIsample, sample, addr_type>; - def : SampleShadowPattern <SIsample, sample_c, addr_type>; - def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// - def : SamplePattern <SIsamplel, sample_l, addr_type>; - def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; - def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; - def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; +multiclass FMADPat <ValueType vt, Instruction inst> { + def : Pat < + (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods vt:$src1, i32:$src1_modifiers), + (VOP3NoMods vt:$src2, i32:$src2_modifiers))), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + $src2_modifiers, $src2, $clamp, $omod) + >; +} - def : SamplePattern <SIsampleb, sample_b, addr_type>; - def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; - def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; - def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; +defm : FMADPat <f16, V_MAC_F16_e64>; +defm : FMADPat <f32, V_MAC_F32_e64>; - def : SamplePattern <SIsampled, sample_d, addr_type>; - def : SampleArrayPattern <SIsampled, sample_d, addr_type>; - def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>; - def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; +multiclass SelectPat <ValueType vt, Instruction inst> { + def : Pat < + (vt (select i1:$src0, vt:$src1, vt:$src2)), + (inst $src2, $src1, $src0) + >; } -defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, - IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, - IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, - IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, - v2i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, - IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, - IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, - IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, - v4i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, - IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, - IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, - IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, - v8i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, - IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, - IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, - IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, - v16i32>; +defm : SelectPat <i16, V_CNDMASK_B32_e64>; +defm : SelectPat <i32, V_CNDMASK_B32_e64>; +defm : SelectPat <f16, V_CNDMASK_B32_e64>; +defm : SelectPat <f32, V_CNDMASK_B32_e64>; + +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; 
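Note: SelectPat above emits (inst $src2, $src1, $src0) because v_cndmask_b32 returns src1 when the per-lane condition bit is set and src0 otherwise, so an IR select's true and false values land in the second and first operands respectively. A minimal model of that operand order:

#include <cassert>

// v_cndmask_b32 picks src1 when the per-lane condition is set and src0 when
// it is clear, which is why SelectPat emits (inst $src2, $src1, $src0).
float cndmask_b32(float src0, float src1, bool cond) { return cond ? src1 : src0; }

// (select c, a, b) as written in the IR:
float select_ir(bool c, float a, float b) { return cndmask_b32(b, a, c); }

int main() {
  assert(select_ir(true, 1.0f, 2.0f) == 1.0f);
  assert(select_ir(false, 1.0f, 2.0f) == 2.0f);
}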
/********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ @@ -2856,6 +567,12 @@ foreach Index = 0-15 in { // FIXME: Why do only some of these type combinations for SReg and // VReg? +// 16-bit bitcast +def : BitConvert <i16, f16, VGPR_32>; +def : BitConvert <f16, i16, VGPR_32>; +def : BitConvert <i16, f16, SReg_32>; +def : BitConvert <f16, i16, SReg_32>; + // 32-bit bitcast def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, VGPR_32>; @@ -2905,7 +622,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>; def : Pat < (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) >; /********** ================================ **********/ @@ -2916,7 +633,7 @@ def : Pat < def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, 0x80000000) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -2925,19 +642,19 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), // Set sign bit. + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. sub1) >; def : Pat < (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) + (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) >; def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) >; def : Pat < @@ -2945,8 +662,8 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. sub1) >; @@ -2955,33 +672,66 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; +def : Pat < + (fneg f16:$src), + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) +>; + +def : Pat < + (fabs f16:$src), + (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) +>; + +def : Pat < + (fneg (fabs f16:$src)), + (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit +>; + /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ def : Pat < - (SGPRImm<(i32 imm)>:$imm), - (S_MOV_B32 imm:$imm) + (VGPRImm<(i32 imm)>:$imm), + (V_MOV_B32_e32 imm:$imm) >; def : Pat < - (SGPRImm<(f32 fpimm)>:$imm), - (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) + (VGPRImm<(f32 fpimm)>:$imm), + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < (i32 imm:$imm), - (V_MOV_B32_e32 imm:$imm) + (S_MOV_B32 imm:$imm) +>; + +// FIXME: Workaround for ordering issue with peephole optimizer where +// a register class copy interferes with immediate folding. 
Should +// use s_mov_b32, which can be shrunk to s_movk_i32 +def : Pat < + (VGPRImm<(f16 fpimm)>:$imm), + (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < (f32 fpimm:$imm), - (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (f16 fpimm:$imm), + (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i32 frameindex:$fi), + (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi))) >; def : Pat < @@ -3011,21 +761,21 @@ def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub3) >; @@ -3042,17 +792,11 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -// Offset in an 32-bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) ->; - // The multiplication scales from [0,1] to the unsigned integer range def : Pat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; @@ -3066,245 +810,8 @@ def : UMad24Pat<V_MAD_U32_U24>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; -/********** ======================= **********/ -/********** Load/Store Patterns **********/ -/********** ======================= **********/ - -class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) ->; - -def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; -def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; -def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; -def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; -def : DSReadPat <DS_READ_B32, i32, 
si_load_local>; - -let AddedComplexity = 100 in { - -def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; - -} // End AddedComplexity = 100 - -def : Pat < - (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; - -class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; -def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; -def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; - -let AddedComplexity = 100 in { - -def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; -} // End AddedComplexity = 100 - -def : Pat < - (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (i1 0)) ->; - -class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) ->; - - -// 32-bit atomics. -def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>; -def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>; -def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; -def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; - -// 64-bit atomics. 
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>; -def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>; -def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; - -def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; - - -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// - -class MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, - PatFrag constant_ld> : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; - -multiclass MUBUFLoad_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET, - ValueType vt, PatFrag atomic_ld> { - def : Pat < - (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) - >; - - def : Pat < - (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) - >; -} - -let Predicates = [isSICI] in { -def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; - -defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; -defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; -} // End Predicates = [isSICI] - -class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, - MUBUF bothen> { - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), 
(as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; -defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, - BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; -defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, - BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; - -multiclass MUBUFStore_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET, - ValueType vt, PatFrag atomic_st> { - // Store follows atomic op convention so address is forst - def : Pat < - (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) - >; - - def : Pat < - (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) - >; -} -let Predicates = [isSICI] in { -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>; -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>; -} // End Predicates = [isSICI] - -class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; - -//===----------------------------------------------------------------------===// -// MTBUF Patterns -//===----------------------------------------------------------------------===// - -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; - -def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; -def : MTBUF_StoreResource <v2i32, 2, 
TBUFFER_STORE_FORMAT_XY>; -def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; -def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; - /********** ====================== **********/ -/********** Indirect adressing **********/ +/********** Indirect addressing **********/ /********** ====================== **********/ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { @@ -3332,48 +839,80 @@ defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// +// SAD Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (add (sub_oneuse (umax i32:$src0, i32:$src1), + (umin i32:$src0, i32:$src1)), + i32:$src2), + (V_SAD_U32 $src0, $src1, $src2) +>; + +def : Pat < + (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), + (sub i32:$src0, i32:$src1), + (sub i32:$src1, i32:$src0)), + i32:$src2), + (V_SAD_U32 $src0, $src1, $src2) +>; + +//===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 + (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 +>; + +def : Pat < + (i16 (sext_inreg i16:$src, i1)), + (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 +>; + +def : Pat < + (i16 (sext_inreg i16:$src, i8)), + (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 + (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 + (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 + (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; -class ZExt_i64_i32_Pat <SDNode ext> : Pat < - (i64 (ext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) +def : Pat < + (i64 (zext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) +>; + +def : Pat < + (i64 (anyext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; class ZExt_i64_i1_Pat <SDNode ext> : Pat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 0), sub1) + (S_MOV_B32 (i32 0)), sub1) >; -def : ZExt_i64_i32_Pat<zext>; -def : ZExt_i64_i32_Pat<anyext>; def : ZExt_i64_i1_Pat<zext>; def : ZExt_i64_i1_Pat<anyext>; @@ -3382,29 +921,29 @@ def : ZExt_i64_i1_Pat<anyext>; def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; def : Pat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 0, -1, $src), sub0, - (V_CNDMASK_B32_e64 0, -1, $src), sub1) + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) >; -class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat < +class FPToI1Pat<Instruction Inst, int KOne, ValueType 
kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) >; -def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>; -def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>; -def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>; -def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector -// comparisions still write to a pair of SGPRs, so treat these as +// comparisons still write to a pair of SGPRs, so treat these as // 64-bit comparisons. When legalizing SGPR copies, instructions // resulting in the copies from SCC to these instructions will be // moved to the VALU. @@ -3425,12 +964,12 @@ def : Pat < def : Pat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) >; def : Pat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) >; def : Pat < @@ -3454,25 +993,25 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; def : Pat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) + (i1 (trunc i16:$a)), + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; def : Pat < - (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) + (i1 (trunc i64:$a)), + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), + (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), + (V_ALIGNBIT_B32 $a, $a, (i32 24)), + (V_ALIGNBIT_B32 $a, $a, (i32 8))) >; multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { @@ -3483,7 +1022,7 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { def : Pat < (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) + (BFM $a, (MOV (i32 0))) >; } @@ -3492,16 +1031,14 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; def : BFEPattern <V_BFE_U32, S_MOV_B32>; -let Predicates = [isSICI] in { -def : Pat < - (i64 (readcyclecounter)), - (S_MEMTIME) +def : Pat< + (fcanonicalize f16:$src), + (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0) >; -} def : Pat< (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) >; def : Pat< @@ -3536,7 +1073,7 @@ def : Pat < (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))), DSTCLAMP.NONE, DSTOMOD.NONE) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td 
b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td index 9d06ccf..5da3754 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -15,7 +15,20 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + + def int_SI_export : Intrinsic <[], + [llvm_i32_ty, // en + llvm_i32_ty, // vm (FIXME: should be i1) + llvm_i32_ty, // done (FIXME: should be i1) + llvm_i32_ty, // tgt + llvm_i32_ty, // compr (FIXME: should be i1) + llvm_float_ty, // src0 + llvm_float_ty, // src1 + llvm_float_ty, // src2 + llvm_float_ty], // src3 + [] + >; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; @@ -186,11 +199,11 @@ let TargetPrefix = "amdgcn", isTarget = 1 in { /* Control flow Intrinsics */ - def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>; + def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>; + def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; + def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>; + def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; + def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; + def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; + def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 9e972a5..99fe96c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -60,31 +60,35 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; - LiveIntervals *LIS; + AliasAnalysis *AA; static bool offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned EltSize); - MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize); + MachineBasicBlock::iterator findMatchingDSInst( + MachineBasicBlock::iterator I, + unsigned EltSize, + SmallVectorImpl<MachineInstr*> &InstsToMove); MachineBasicBlock::iterator mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize); + unsigned EltSize, + ArrayRef<MachineInstr*> InstsToMove); MachineBasicBlock::iterator mergeWrite2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize); + unsigned EltSize, + ArrayRef<MachineInstr*> InstsToMove); public: static char ID; 
SILoadStoreOptimizer() : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - LIS(nullptr) {} + AA(nullptr) {} SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); @@ -94,16 +98,11 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Load / Store Optimizer"; - } + StringRef getPassName() const override { return "SI Load / Store Optimizer"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addPreserved<SlotIndexes>(); - AU.addPreserved<LiveIntervals>(); - AU.addPreserved<LiveVariables>(); - AU.addRequired<LiveIntervals>(); + AU.addRequired<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -113,9 +112,7 @@ public: INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) @@ -127,6 +124,73 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { return new SILoadStoreOptimizer(TM); } +static void moveInstsAfter(MachineBasicBlock::iterator I, + ArrayRef<MachineInstr*> InstsToMove) { + MachineBasicBlock *MBB = I->getParent(); + ++I; + for (MachineInstr *MI : InstsToMove) { + MI->removeFromParent(); + MBB->insert(I, MI); + } +} + +static void addDefsToList(const MachineInstr &MI, + SmallVectorImpl<const MachineOperand *> &Defs) { + for (const MachineOperand &Def : MI.defs()) { + Defs.push_back(&Def); + } +} + +static bool memAccessesCanBeReordered( + MachineBasicBlock::iterator A, + MachineBasicBlock::iterator B, + const SIInstrInfo *TII, + llvm::AliasAnalysis * AA) { + return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) || + // RAW or WAR - cannot reorder + // WAW - cannot reorder + // RAR - safe to reorder + !(A->mayStore() || B->mayStore())); +} + +// Add MI and its defs to the lists if MI reads one of the defs that are +// already in the list. Returns true in that case. +static bool +addToListsIfDependent(MachineInstr &MI, + SmallVectorImpl<const MachineOperand *> &Defs, + SmallVectorImpl<MachineInstr*> &Insts) { + for (const MachineOperand *Def : Defs) { + bool ReadDef = MI.readsVirtualRegister(Def->getReg()); + // If ReadDef is true, then there is a use of Def between I + // and the instruction that I will potentially be merged with. We + // will need to move this instruction after the merged instructions. 
+ if (ReadDef) { + Insts.push_back(&MI); + addDefsToList(MI, Defs); + return true; + } + } + + return false; +} + +static bool +canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef<MachineInstr*> InstsToMove, + const SIInstrInfo *TII, + AliasAnalysis *AA) { + + assert(MemOp.mayLoadOrStore()); + + for (MachineInstr *InstToMove : InstsToMove) { + if (!InstToMove->mayLoadOrStore()) + continue; + if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) + return false; + } + return true; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned Size) { @@ -156,43 +220,99 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, MachineBasicBlock::iterator SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize){ + unsigned EltSize, + SmallVectorImpl<MachineInstr*> &InstsToMove) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; ++MBBI; - if (MBBI->getOpcode() != I->getOpcode()) - return E; - - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return E; - - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); - - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), - AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; - - // Check both offsets fit in the reduced range. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) - return MBBI; - } + SmallVector<const MachineOperand *, 8> DefsToMove; + addDefsToList(*I, DefsToMove); + for ( ; MBBI != E; ++MBBI) { + + if (MBBI->getOpcode() != I->getOpcode()) { + + // This is not a matching DS instruction, but we can keep looking as + // long as one of these conditions are met: + // 1. It is safe to move I down past MBBI. + // 2. It is safe to move MBBI down past the instruction that I will + // be merged into. + + if (MBBI->hasUnmodeledSideEffects()) + // We can't re-order this instruction with respect to other memory + // opeations, so we fail both conditions mentioned above. + return E; + + if (MBBI->mayLoadOrStore() && + !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) { + // We fail condition #1, but we may still be able to satisfy condition + // #2. Add this instruction to the move list and then we will check + // if condition #2 holds once we have selected the matching instruction. + InstsToMove.push_back(&*MBBI); + addDefsToList(*MBBI, DefsToMove); + continue; + } + + // When we match I with another DS instruction we will be moving I down + // to the location of the matched instruction any uses of I will need to + // be moved down as well. + addToListsIfDependent(*MBBI, DefsToMove, InstsToMove); + continue; + } + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + // Handle a case like + // DS_WRITE_B32 addr, v, idx0 + // w = DS_READ_B32 addr, idx0 + // DS_WRITE_B32 addr, f(w), idx1 + // where the DS_READ_B32 ends up in InstsToMove and therefore prevents + // merging of the two writes. 
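Whether two DS accesses can be paired ultimately comes down to whether their constant offsets are representable in one DS_READ2/DS_WRITE2 encoding. The body of offsetsCanBeCombined is outside this hunk, so the following is only a sketch of the basic constraint it checks, under the usual encoding assumption that offset0/offset1 are stored in units of the element size and must each fit in 8 bits (the helper name is illustrative, and the ST64 variants are ignored):

#include <cstdint>
#include <utility>

// Sketch: can two DS accesses at byte offsets Offset0/Offset1 from the same
// base be merged into one read2/write2 with element size EltSize (4 or 8)?
static bool canEncodeDS2(unsigned Offset0, unsigned Offset1, unsigned EltSize,
                         uint8_t &Enc0, uint8_t &Enc1) {
  if (Offset0 % EltSize != 0 || Offset1 % EltSize != 0)
    return false;                       // both offsets must be element-aligned
  unsigned E0 = Offset0 / EltSize;
  unsigned E1 = Offset1 / EltSize;
  if (E0 == E1 || E0 > 255 || E1 > 255)
    return false;                       // distinct, and each fits in 8 bits
  // New in this patch (see mergeRead2Pair/mergeWrite2Pair below): the merged
  // instruction is canonicalized so the smaller offset comes first, and the
  // original destinations are taken from the correspondingly swapped subregs.
  if (E0 > E1)
    std::swap(E0, E1);
  Enc0 = static_cast<uint8_t>(E0);
  Enc1 = static_cast<uint8_t>(E1);
  return true;
}

For example, two DS_READ_B32 from the same base at byte offsets 4 and 8 can become a single DS_READ2_B32 with offset0 = 1 and offset1 = 2.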
+ if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove)) + continue; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + // We also need to go through the list of instructions that we plan to + // move and make sure they are all safe to move down past the merged + // instruction. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize) && + canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA)) + return MBBI; + } + + // We've found a load/store that we couldn't merge for some reason. + // We could potentially keep looking, but we'd need to make sure that + // it was safe to move I and also all the instruction in InstsToMove + // down past this instruction. + if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI + !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users + ) + break; + } return E; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize) { + unsigned EltSize, + ArrayRef<MachineInstr*> InstsToMove) { MachineBasicBlock *MBB = I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -220,6 +340,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; } + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + + if (NewOffset0 > NewOffset1) { + // Canonicalize the merged instruction so the smaller offset comes first. + std::swap(NewOffset0, NewOffset1); + std::swap(SubRegIdx0, SubRegIdx1); + } + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); @@ -232,62 +361,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( DebugLoc DL = I->getDebugLoc(); MachineInstrBuilder Read2 - = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg) .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 .addImm(0) // gds .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); - - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + (void)Read2; const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc) + BuildMI(*MBB, Paired, DL, CopyDesc) .addOperand(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc) + MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc) .addOperand(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - LIS->InsertMachineInstrInMaps(*Read2); - - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - - // The new write to the original destination register is now the copy. Steal - // the old SlotIndex. - LIS->ReplaceMachineInstrInMaps(*I, *Copy0); - LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1); + moveInstsAfter(Copy1, InstsToMove); + MachineBasicBlock::iterator Next = std::next(I); I->eraseFromParent(); Paired->eraseFromParent(); - LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); - LIS->shrinkToUses(&AddrRegLI); - - LIS->createAndComputeVirtRegInterval(DestReg); - - if (UpdateM0Range) { - SlotIndex Read2Index = LIS->getInstructionIndex(*Read2); - M0Segment->end = Read2Index.getRegSlot(); - } - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Read2.getInstr(); + return Next; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize) { + unsigned EltSize, + ArrayRef<MachineInstr*> InstsToMove) { MachineBasicBlock *MBB = I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be @@ -316,6 +423,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; } + if (NewOffset0 > NewOffset1) { + // Canonicalize the merged instruction so the smaller offset comes first. + std::swap(NewOffset0, NewOffset1); + std::swap(Data0, Data1); + } + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); @@ -323,15 +436,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = I->getDebugLoc(); - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - MachineInstrBuilder Write2 - = BuildMI(*MBB, I, DL, Write2Desc) + = BuildMI(*MBB, Paired, DL, Write2Desc) .addOperand(*Addr) // addr .addOperand(*Data0) // data0 .addOperand(*Data1) // data1 @@ -341,24 +447,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); - // XXX - How do we express subregisters here? 
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + moveInstsAfter(Write2, InstsToMove); - LIS->RemoveMachineInstrFromMaps(*I); - LIS->RemoveMachineInstrFromMaps(*Paired); + MachineBasicBlock::iterator Next = std::next(I); I->eraseFromParent(); Paired->eraseFromParent(); - // This doesn't handle physical registers like M0 - LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); - - if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(*Write2); - M0Segment->end = Write2Index.getRegSlot(); - } - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Write2.getInstr(); + return Next; } // Scan through looking for adjacent LDS operations with constant offsets from @@ -376,13 +472,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } + SmallVector<MachineInstr*, 8> InstsToMove; unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, + InstsToMove); if (Match != E) { Modified = true; - I = mergeRead2Pair(I, Match, Size); + I = mergeRead2Pair(I, Match, Size, InstsToMove); } else { ++I; } @@ -390,10 +488,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, + InstsToMove); if (Match != E) { Modified = true; - I = mergeWrite2Pair(I, Match, Size); + I = mergeWrite2Pair(I, Match, Size, InstsToMove); } else { ++I; } @@ -419,13 +518,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - - LIS = &getAnalysis<LiveIntervals>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); - assert(!MRI->isSSA()); - bool Modified = false; for (MachineBasicBlock &MBB : MF) diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index ee1d5da..7ed18f2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -58,7 +58,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" using namespace llvm; @@ -68,63 +67,50 @@ namespace { class SILowerControlFlow : public MachineFunctionPass { private: - static const unsigned SkipThreshold = 12; - const SIRegisterInfo *TRI; const SIInstrInfo *TII; + LiveIntervals *LIS; + MachineRegisterInfo *MRI; - bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); - - void Skip(MachineInstr &From, MachineOperand &To); - bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); - - void If(MachineInstr &MI); - void Else(MachineInstr &MI, bool ExecModified); - void Break(MachineInstr &MI); - void IfBreak(MachineInstr &MI); - void ElseBreak(MachineInstr &MI); - void Loop(MachineInstr &MI); - void EndCf(MachineInstr &MI); - - void Kill(MachineInstr &MI); - void Branch(MachineInstr &MI); - - MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - - 
std::pair<MachineBasicBlock *, MachineBasicBlock *> - splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + void emitIf(MachineInstr &MI); + void emitElse(MachineInstr &MI); + void emitBreak(MachineInstr &MI); + void emitIfBreak(MachineInstr &MI); + void emitElseBreak(MachineInstr &MI); + void emitLoop(MachineInstr &MI); + void emitEndCf(MachineInstr &MI); - void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, - const MachineRegisterInfo &MRI, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg); + void findMaskOperands(MachineInstr &MI, unsigned OpNo, + SmallVectorImpl<MachineOperand> &Src) const; - void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, - MachineInstr *MovRel, - const MachineOperand &IdxReg, - int Offset); - - bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg, - int Offset) const; - bool indirectSrc(MachineInstr &MI); - bool indirectDst(MachineInstr &MI); + void combineMasks(MachineInstr &MI); public: static char ID; SILowerControlFlow() : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } + MachineFunctionPass(ID), + TRI(nullptr), + TII(nullptr), + LIS(nullptr), + MRI(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { + StringRef getPassName() const override { return "SI Lower control flow pseudo instructions"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // Should preserve the same set that TwoAddressInstructions does. + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreservedID(LiveVariablesID); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -132,555 +118,283 @@ public: char SILowerControlFlow::ID = 0; INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, - "SI lower control flow", false, false) + "SI lower control flow", false, false) -char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID; +static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { + MachineOperand &ImpDefSCC = MI.getOperand(3); + assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); - -FunctionPass *llvm::createSILowerControlFlowPass() { - return new SILowerControlFlow(); + ImpDefSCC.setIsDead(IsDead); } -static bool opcodeEmitsNoInsts(unsigned Opc) { - switch (Opc) { - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::BUNDLE: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: - case TargetOpcode::DBG_VALUE: - return true; - default: - return false; - } -} - -bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { - if (From->succ_empty()) - return false; - - unsigned NumInstr = 0; - MachineFunction *MF = From->getParent(); - - for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - NumInstr < SkipThreshold && I != E; ++I) { - if (opcodeEmitsNoInsts(I->getOpcode())) - continue; - - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken - // when EXEC = 0. 
We should skip the loop lest it becomes infinite. - if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || - I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) - return true; - - if (I->isInlineAsm()) { - const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - const char *AsmStr = I->getOperand(0).getSymbolName(); - - // inlineasm length estimate is number of bytes assuming the longest - // instruction. - uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); - NumInstr += MaxAsmSize / MAI->getMaxInstLength(); - } else { - ++NumInstr; - } +char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; - if (NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) { - - if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) - return; - - DebugLoc DL = From.getDebugLoc(); - BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To); -} - -bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { +void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction *MF = MBB.getParent(); - - if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || - !shouldSkip(&MBB, &MBB.getParent()->back())) - return false; - - MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); - MBB.addSuccessor(SkipBB); - const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); - // If the exec mask is non-zero, skip the next two instructions - BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&NextBB); - - MachineBasicBlock::iterator Insert = SkipBB->begin(); - - // Exec mask is zero: Export to NULL target... - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef); - - // ... and terminate wavefront. - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); - - return true; -} - -void SILowerControlFlow::If(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) - .addReg(Vcc); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - Skip(MI, MI.getOperand(2)); + MachineOperand &SaveExec = MI.getOperand(0); + MachineOperand &Cond = MI.getOperand(1); + assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && + Cond.getSubReg() == AMDGPU::NoSubRegister); - // Insert a pseudo terminator to help keep the verifier happy. 
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) - .addOperand(MI.getOperand(2)) - .addReg(Reg); + unsigned SaveExecReg = SaveExec.getReg(); - MI.eraseFromParent(); -} + MachineOperand &ImpDefSCC = MI.getOperand(4); + assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); -void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) - .addReg(Src); // Saved EXEC - - if (ExecModified) { - // Adjust the saved exec to account for the modifications during the flow - // block that contains the ELSE. This can happen when WQM mode is switched - // off. - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) - .addReg(AMDGPU::EXEC) - .addReg(Dst); + // Add an implicit def of exec to discourage scheduling VALU after this which + // will interfere with trying to form s_and_saveexec_b64 later. + unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *CopyExec = + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::EXEC, RegState::ImplicitDefine); + + unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + MachineInstr *And = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp) + .addReg(CopyReg) + //.addReg(AMDGPU::EXEC) + .addReg(Cond.getReg()); + setImpSCCDefDead(*And, true); + + MachineInstr *Xor = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) + .addReg(Tmp) + .addReg(CopyReg); + setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); + + // Use a copy that is a terminator to get correct spill code placement it with + // fast regalloc. + MachineInstr *SetExec = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC) + .addReg(Tmp, RegState::Kill); + + // Insert a pseudo terminator to help keep the verifier happy. This will also + // be used later when inserting skips. + MachineInstr *NewBr = + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addOperand(MI.getOperand(2)); + + if (!LIS) { + MI.eraseFromParent(); + return; } - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Dst); + LIS->InsertMachineInstrInMaps(*CopyExec); - Skip(MI, MI.getOperand(2)); + // Replace with and so we don't need to fix the live interval for condition + // register. + LIS->ReplaceMachineInstrInMaps(MI, *And); - // Insert a pseudo terminator to help keep the verifier happy. - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) - .addOperand(MI.getOperand(2)) - .addReg(Dst); + LIS->InsertMachineInstrInMaps(*Xor); + LIS->InsertMachineInstrInMaps(*SetExec); + LIS->InsertMachineInstrInMaps(*NewBr); + LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); MI.eraseFromParent(); -} - -void SILowerControlFlow::Break(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - MI.eraseFromParent(); + // FIXME: Is there a better way of adjusting the liveness? It shouldn't be + // hard to add another def here but I'm not sure how to correctly update the + // valno. 
+ LIS->removeInterval(SaveExecReg); + LIS->createAndComputeVirtRegInterval(SaveExecReg); + LIS->createAndComputeVirtRegInterval(Tmp); + LIS->createAndComputeVirtRegInterval(CopyReg); } -void SILowerControlFlow::IfBreak(MachineInstr &MI) { +void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Vcc) - .addReg(Src); - - MI.eraseFromParent(); -} + const DebugLoc &DL = MI.getDebugLoc(); -void SILowerControlFlow::ElseBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + unsigned DstReg = MI.getOperand(0).getReg(); + assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Saved = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); + bool ExecModified = MI.getOperand(3).getImm() != 0; + MachineBasicBlock::iterator Start = MBB.begin(); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Saved) - .addReg(Src); + // We are running before TwoAddressInstructions, and si_else's operands are + // tied. In order to correctly tie the registers, split this into a copy of + // the src like it does. + unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) + .addOperand(MI.getOperand(1)); // Saved EXEC - MI.eraseFromParent(); -} + // This must be inserted before phis and any spill code inserted before the + // else. + unsigned SaveReg = ExecModified ? + MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg; + MachineInstr *OrSaveExec = + BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg) + .addReg(CopyReg); -void SILowerControlFlow::Loop(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Src = MI.getOperand(0).getReg(); + MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Src); + MachineBasicBlock::iterator ElsePt(MI); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)); + if (ExecModified) { + MachineInstr *And = + BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg) + .addReg(AMDGPU::EXEC) + .addReg(SaveReg); - MI.eraseFromParent(); -} + if (LIS) + LIS->InsertMachineInstrInMaps(*And); + } -void SILowerControlFlow::EndCf(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); + MachineInstr *Xor = + BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(DstReg); - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); + MachineInstr *Branch = + BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addMBB(DestBB); - MI.eraseFromParent(); -} - -void SILowerControlFlow::Branch(MachineInstr &MI) { - MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); - if (MBB == MI.getParent()->getNextNode()) + if (!LIS) { MI.eraseFromParent(); - - // If these aren't equal, this is probably an infinite loop. 
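The new emitIf is easier to follow as plain mask arithmetic on the 64-bit EXEC register; the _term pseudos and the LiveIntervals bookkeeping above are about spill placement and liveness, not the math. A wave-level sketch of what the S_AND_B64 / S_XOR_B64 pair computes (helper names are illustrative):

#include <cstdint>

// SI_IF (emitIf): split the incoming EXEC mask into the lanes that run the
// 'then' block now and the lanes that are parked until the else/endif.
struct IfMasks {
  uint64_t ThenExec; // becomes the new EXEC (S_AND_B64 Tmp, CopyOfExec, Cond)
  uint64_t SaveExec; // restored later       (S_XOR_B64 Save, Tmp, CopyOfExec)
};

static IfMasks lowerIf(uint64_t Exec, uint64_t Cond) {
  IfMasks M;
  M.ThenExec = Exec & Cond;        // lanes where the branch condition holds
  M.SaveExec = M.ThenExec ^ Exec;  // equals Exec & ~Cond: lanes skipping 'then'
  return M;
}

// SI_END_CF simply ORs the saved lanes back into EXEC, as the S_OR_B64 on
// EXEC in the old EndCf above shows.
static uint64_t lowerEndCf(uint64_t Exec, uint64_t Saved) {
  return Exec | Saved;
}

Because ThenExec can be all zeros, the SI_MASK_BRANCH emitted here is what the later skip-insertion pass (SIInsertSkips) uses to branch over the region when no lanes remain active.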
-} - -void SILowerControlFlow::Kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Op = MI.getOperand(0); - -#ifndef NDEBUG - CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); - // Kill is only allowed in pixel / geometry shaders. - assert(CallConv == CallingConv::AMDGPU_PS || - CallConv == CallingConv::AMDGPU_GS); -#endif - - // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm())) { - // Constant operand: Set exec mask to 0 or do nothing - if (Op.getImm() & 0x80000000) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addImm(0); - } - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) - .addImm(0) - .addOperand(Op); + return; } + LIS->RemoveMachineInstrFromMaps(MI); MI.eraseFromParent(); -} -// All currently live registers must remain so in the remainder block. -void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, - const MachineRegisterInfo &MRI, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg) { - // Add reg defined in loop body. - RemainderLiveRegs.addReg(SaveReg); - - if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) { - if (!Val->isUndef()) { - RemainderLiveRegs.addReg(Val->getReg()); - LoopBB.addLiveIn(Val->getReg()); - } - } + LIS->InsertMachineInstrInMaps(*OrSaveExec); - for (unsigned Reg : RemainderLiveRegs) { - if (MRI.isAllocatable(Reg)) - RemainderBB.addLiveIn(Reg); - } + LIS->InsertMachineInstrInMaps(*Xor); + LIS->InsertMachineInstrInMaps(*Branch); - const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); - if (!Src->isUndef()) - LoopBB.addLiveIn(Src->getReg()); + // src reg is tied to dst reg. + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + LIS->createAndComputeVirtRegInterval(CopyReg); + if (ExecModified) + LIS->createAndComputeVirtRegInterval(SaveReg); - if (!IdxReg.isUndef()) - LoopBB.addLiveIn(IdxReg.getReg()); - LoopBB.sortUniqueLiveIns(); + // Let this be recomputed. 
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); } -void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, - DebugLoc DL, - MachineInstr *MovRel, - const MachineOperand &IdxReg, - int Offset) { - MachineBasicBlock::iterator I = LoopBB.begin(); - - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); - - // Move index from VCC into M0 - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); - - // Compare the just read M0 value to all possible Idx values - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) - .addReg(AMDGPU::M0) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); - - // Update EXEC, save the original EXEC value to VCC - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); - - if (Offset != 0) { - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - - // Do the actual move - LoopBB.insert(I, MovRel); +void SILowerControlFlow::emitBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + MachineInstr *Or = + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + .addOperand(MI.getOperand(1)); - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&LoopBB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *Or); + MI.eraseFromParent(); } -MachineBasicBlock *SILowerControlFlow::insertSkipBlock( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - MachineFunction *MF = MBB.getParent(); - - MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF->insert(MBBI, SkipBB); - - return SkipBB; +void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { + MI.setDesc(TII->get(AMDGPU::S_OR_B64)); } -std::pair<MachineBasicBlock *, MachineBasicBlock *> -SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - MachineFunction *MF = MBB.getParent(); - - // To insert the loop we need to split the block. Move everything after this - // point to a new block, and insert a new empty block between the two. - MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); - MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF->insert(MBBI, LoopBB); - MF->insert(MBBI, RemainderBB); - - // Move the rest of the block into a new block. - RemainderBB->transferSuccessors(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - - MBB.addSuccessor(LoopBB); - - return std::make_pair(LoopBB, RemainderBB); +void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { + MI.setDesc(TII->get(AMDGPU::S_OR_B64)); } -// Returns true if a new block was inserted. 
-bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { +void SILowerControlFlow::emitLoop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); - const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + MachineInstr *AndN2 = + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addOperand(MI.getOperand(0)); - if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) { - if (Offset != 0) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())) - .addImm(Offset); - } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())); - } + MachineInstr *Branch = + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)); - MBB.insert(I, MovRel); - MI.eraseFromParent(); - return false; + if (LIS) { + LIS->ReplaceMachineInstrInMaps(MI, *AndN2); + LIS->InsertMachineInstrInMaps(*Branch); } - MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - SaveOp->setIsDead(false); - unsigned Save = SaveOp->getReg(); - - // Reading from a VGPR requires looping over all workitems in the wavefront. - assert(AMDGPU::SReg_64RegClass.contains(Save) && - AMDGPU::VGPR_32RegClass.contains(Idx->getReg())); - - // Save the EXEC mask - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); - - LivePhysRegs RemainderLiveRegs(TRI); - - RemainderLiveRegs.addLiveOuts(MBB); - - MachineBasicBlock *LoopBB; - MachineBasicBlock *RemainderBB; - - std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - - for (const MachineInstr &Inst : reverse(*RemainderBB)) - RemainderLiveRegs.stepBackward(Inst); - - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - LoopBB->addSuccessor(RemainderBB); - LoopBB->addSuccessor(LoopBB); - - splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, - *RemainderBB, Save, *Idx); - - emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); - - MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); - MI.eraseFromParent(); - return true; -} - -/// \param @VecReg The register which holds element zero of the vector being -/// addressed into. -// -/// \param[in] @Idx The index operand from the movrel instruction. This must be -// a register, but may be NoRegister. -/// -/// \param[in] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant -// value that needs to be added to the value stored in M0. -std::pair<unsigned, int> -SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const { - unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); - if (!SubReg) - SubReg = VecReg; - - const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg); - const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int NumElts = SuperRC->getSize() / RC->getSize(); - - int BaseRegIdx = TRI->getHWRegIndex(SubReg); - - // Skip out of bounds offsets, or else we would end up using an undefined - // register. 
- if (Offset >= NumElts)
- return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
-
- int RegIdx = BaseRegIdx + Offset;
- if (RegIdx < 0) {
- Offset = RegIdx;
- RegIdx = 0;
- } else {
- Offset = 0;
- }
-
- unsigned Reg = RC->getRegister(RegIdx);
- return std::make_pair(Reg, Offset);
}
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
 MachineBasicBlock &MBB = *MI.getParent();
 const DebugLoc &DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+ MachineBasicBlock::iterator InsPt = MBB.begin();
+ MachineInstr *NewMI =
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
- MI.eraseFromParent();
- return false;
- }
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ MI.eraseFromParent();
- return loadM0(MI, MovRel, Offset);
+ if (LIS)
+ LIS->handleMove(*NewMI);
}
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
-
- MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
- .addOperand(*Val);
- MI.eraseFromParent();
- return false;
+// Returns replace operands for a logical operation, either single result
+// for exec or two operands if source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const {
+ MachineOperand &Op = MI.getOperand(OpNo);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ Src.push_back(Op);
+ return;
 }
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
- .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
- .addReg(Dst, RegState::Implicit);
+ MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+ if (!Def || Def->getParent() != MI.getParent() ||
+ !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+ return;
- return loadM0(MI, MovRel, Offset);
+ // Make sure we do not modify exec between def and use.
+ // A copy with implicitly defined exec inserted earlier is an exclusion, it
+ // does not really modify exec.
+ for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) + if (I->modifiesRegister(AMDGPU::EXEC, TRI) && + !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC)) + return; + + for (const auto &SrcOp : Def->explicit_operands()) + if (SrcOp.isUse() && (!SrcOp.isReg() || + TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || + SrcOp.getReg() == AMDGPU::EXEC)) + Src.push_back(SrcOp); +} + +// Search and combine pairs of equivalent instructions, like +// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y +// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y +// One of the operands is exec mask. +void SILowerControlFlow::combineMasks(MachineInstr &MI) { + assert(MI.getNumExplicitOperands() == 3); + SmallVector<MachineOperand, 4> Ops; + unsigned OpToReplace = 1; + findMaskOperands(MI, 1, Ops); + if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy + findMaskOperands(MI, 2, Ops); + if (Ops.size() != 3) return; + + unsigned UniqueOpndIdx; + if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2; + else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; + else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; + else return; + + unsigned Reg = MI.getOperand(OpToReplace).getReg(); + MI.RemoveOperand(OpToReplace); + MI.addOperand(Ops[UniqueOpndIdx]); + if (MRI->use_empty(Reg)) + MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { @@ -688,148 +402,66 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - bool HaveKill = false; - bool NeedFlat = false; - unsigned Depth = 0; + // This doesn't actually need LiveIntervals, but we can preserve them. + LIS = getAnalysisIfAvailable<LiveIntervals>(); + MRI = &MF.getRegInfo(); MachineFunction::iterator NextBB; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; - MachineBasicBlock *EmptyMBBAtEnd = nullptr; - MachineBasicBlock::iterator I, Next; - bool ExecModified = false; + MachineBasicBlock::iterator I, Next, Last; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) { Next = std::next(I); - MachineInstr &MI = *I; - // Flat uses m0 in case it needs to access LDS. - if (TII->isFLAT(MI)) - NeedFlat = true; - - if (I->modifiesRegister(AMDGPU::EXEC, TRI)) - ExecModified = true; - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_IF: - ++Depth; - If(MI); - break; - - case AMDGPU::SI_ELSE: - Else(MI, ExecModified); - break; - - case AMDGPU::SI_BREAK: - Break(MI); - break; - - case AMDGPU::SI_IF_BREAK: - IfBreak(MI); - break; - - case AMDGPU::SI_ELSE_BREAK: - ElseBreak(MI); - break; - - case AMDGPU::SI_LOOP: - ++Depth; - Loop(MI); - break; - - case AMDGPU::SI_END_CF: - if (--Depth == 0 && HaveKill) { - HaveKill = false; - // TODO: Insert skip if exec is 0? - } - - EndCf(MI); - break; - - case AMDGPU::SI_KILL_TERMINATOR: - if (Depth == 0) { - if (skipIfDead(MI, *NextBB)) { - NextBB = std::next(BI); - BE = MF.end(); - } - } else - HaveKill = true; - Kill(MI); - break; - - case AMDGPU::S_BRANCH: - Branch(MI); - break; - - case AMDGPU::SI_INDIRECT_SRC_V1: - case AMDGPU::SI_INDIRECT_SRC_V2: - case AMDGPU::SI_INDIRECT_SRC_V4: - case AMDGPU::SI_INDIRECT_SRC_V8: - case AMDGPU::SI_INDIRECT_SRC_V16: - if (indirectSrc(MI)) { - // The block was split at this point. 
We can safely skip the middle - // inserted block to the following which contains the rest of this - // block's instructions. - NextBB = std::next(BI); - BE = MF.end(); - Next = MBB.end(); - } - - break; - - case AMDGPU::SI_INDIRECT_DST_V1: - case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V8: - case AMDGPU::SI_INDIRECT_DST_V16: - if (indirectDst(MI)) { - // The block was split at this point. We can safely skip the middle - // inserted block to the following which contains the rest of this - // block's instructions. - NextBB = std::next(BI); - BE = MF.end(); - Next = MBB.end(); - } - - break; - - case AMDGPU::SI_RETURN: { - assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); - - // Graphics shaders returning non-void shouldn't contain S_ENDPGM, - // because external bytecode will be appended at the end. - if (BI != --MF.end() || I != MBB.getFirstTerminator()) { - // SI_RETURN is not the last instruction. Add an empty block at - // the end and jump there. - if (!EmptyMBBAtEnd) { - EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); - MF.insert(MF.end(), EmptyMBBAtEnd); - } - - MBB.addSuccessor(EmptyMBBAtEnd); - BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(EmptyMBBAtEnd); - I->eraseFromParent(); - } - break; - } + case AMDGPU::SI_IF: + emitIf(MI); + break; + + case AMDGPU::SI_ELSE: + emitElse(MI); + break; + + case AMDGPU::SI_BREAK: + emitBreak(MI); + break; + + case AMDGPU::SI_IF_BREAK: + emitIfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + emitElseBreak(MI); + break; + + case AMDGPU::SI_LOOP: + emitLoop(MI); + break; + + case AMDGPU::SI_END_CF: + emitEndCf(MI); + break; + + case AMDGPU::S_AND_B64: + case AMDGPU::S_OR_B64: + // Cleanup bit manipulations on exec mask + combineMasks(MI); + Last = I; + continue; + + default: + Last = I; + continue; } - } - } - if (NeedFlat && MFI->IsKernel) { - // TODO: What to use with function calls? - // We will need to Initialize the flat scratch register pair. - if (NeedFlat) - MFI->setHasFlatInstructions(true); + // Replay newly inserted code to combine masks + Next = (Last == MBB.end()) ? 
MBB.begin() : Last; + } } return true; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index dc1d20d..be2e14f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -41,9 +41,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Lower i1 Copies"; - } + StringRef getPassName() const override { return "SI Lower i1 Copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -102,12 +100,12 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + DebugLoc DL = MI.getDebugLoc(); + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); if (DstRC == &AMDGPU::VReg_1RegClass && TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { I1Defs.push_back(Dst.getReg()); - DebugLoc DL = MI.getDebugLoc(); - MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { if (DefInst->getOperand(1).isImm()) { I1Defs.push_back(Dst.getReg()); @@ -131,10 +129,26 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) - .addOperand(Dst) - .addOperand(Src) - .addImm(0); + if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 && + DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() && + DefInst->getOperand(1).getImm() == 0 && + DefInst->getOperand(2).getImm() != 0 && + DefInst->getOperand(3).isReg() && + TargetRegisterInfo::isVirtualRegister( + DefInst->getOperand(3).getReg()) && + TRI->getCommonSubClass( + MRI.getRegClass(DefInst->getOperand(3).getReg()), + &AMDGPU::SGPR_64RegClass)) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) + .addOperand(Dst) + .addReg(AMDGPU::EXEC) + .addOperand(DefInst->getOperand(3)); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) + .addOperand(Dst) + .addOperand(Src) + .addImm(0); + } MI.eraseFromParent(); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 848be32..ecd46b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -26,9 +26,6 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( cl::ReallyHidden, cl::init(true)); -// Pin the vtable to this file. 
-void SIMachineFunctionInfo::anchor() {} - SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), @@ -51,8 +48,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), ReturnsVoid(true), - MaximumWorkGroupSize(0), - DebuggerReservedVGPRCount(0), + FlatWorkGroupSizes(0, 0), + WavesPerEU(0, 0), DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), LDSWaveSpillSize(0), @@ -62,14 +59,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) HasSpilledSGPRs(false), HasSpilledVGPRs(false), HasNonSpillStackObjects(false), - HasFlatInstructions(false), NumSpilledSGPRs(0), NumSpilledVGPRs(0), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), - DispatchID(false), KernargSegmentPtr(false), + DispatchID(false), FlatScratchInit(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), @@ -81,13 +77,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), - WorkItemIDZ(false) { + WorkItemIDZ(false), + PrivateMemoryInputPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); if (!AMDGPU::isShader(F->getCallingConv())) { KernargSegmentPtr = true; @@ -113,12 +110,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDY = true; bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo->hasStackObjects(); + bool HasStackObjects = FrameInfo.hasStackObjects(); if (HasStackObjects || MaySpill) PrivateSegmentWaveByteOffset = true; - if (ST.isAmdHsaOS()) { + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -127,6 +124,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-queue-ptr")) QueuePtr = true; + + if (F->hasFnAttribute("amdgpu-dispatch-id")) + DispatchID = true; + } else if (ST.isMesaGfxShader(MF)) { + if (HasStackObjects || MaySpill) + PrivateMemoryInputPtr = true; } // We don't need to worry about accessing spills with flat instructions. 
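
For reference, the constructor change above only enables the optional preloaded inputs that the kernel requests through string function attributes. Below is a minimal, LLVM-independent sketch of that selection; KernelInputs and selectInputs are invented names for illustration, and only the two attribute strings visible in this hunk are used.

#include <cstdio>
#include <set>
#include <string>

struct KernelInputs {
  bool QueuePtr = false;
  bool DispatchID = false;
};

static KernelInputs selectInputs(const std::set<std::string> &FnAttrs) {
  KernelInputs In;
  // Mirrors: if (F->hasFnAttribute("amdgpu-queue-ptr")) QueuePtr = true;
  In.QueuePtr = FnAttrs.count("amdgpu-queue-ptr") != 0;
  // Mirrors: if (F->hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true;
  In.DispatchID = FnAttrs.count("amdgpu-dispatch-id") != 0;
  return In;
}

int main() {
  KernelInputs In = selectInputs({"amdgpu-dispatch-id"});
  std::printf("queue_ptr=%d dispatch_id=%d\n", In.QueuePtr, In.DispatchID);
  return 0;
}
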
@@ -136,13 +139,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ST.isAmdHsaOS()) FlatScratchInit = true; - if (AMDGPU::isCompute(F->getCallingConv())) - MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F); - else - MaximumWorkGroupSize = ST.getWavefrontSize(); - - if (ST.debuggerReserveRegs()) - DebuggerReservedVGPRCount = 4; + FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); + WavesPerEU = ST.getWavesPerEU(*F); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -174,6 +172,13 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return KernargSegmentPtrUserSGPR; } +unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { + DispatchIDUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchIDUserSGPR; +} + unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); @@ -181,6 +186,13 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } +unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { + PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return PrivateMemoryPtrUserSGPR; +} + SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( MachineFunction *MF, unsigned FrameIndex, @@ -191,9 +203,9 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + int64_t Offset = FrameInfo.getObjectOffset(FrameIndex); Offset += SubIdx * 4; unsigned LaneVGPRIdx = Offset / (64 * 4); @@ -223,8 +235,3 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; return Spill; } - -unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( - const MachineFunction &MF) const { - return MaximumWorkGroupSize; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index f5bd636..6fc8d18 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -23,12 +23,59 @@ namespace llvm { class MachineRegisterInfo; +class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { +public: + explicit AMDGPUImagePseudoSourceValue() : + PseudoSourceValue(PseudoSourceValue::TargetCustom) { } + + bool isConstant(const MachineFrameInfo *) const override { + // This should probably be true for most images, but we will start by being + // conservative. + return false; + } + + bool isAliased(const MachineFrameInfo *) const override { + // FIXME: If we ever change image intrinsics to accept fat pointers, then + // this could be true for some cases. + return false; + } + + bool mayAlias(const MachineFrameInfo*) const override { + // FIXME: If we ever change image intrinsics to accept fat pointers, then + // this could be true for some cases. 
+ return false; + } +}; + +class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue { +public: + explicit AMDGPUBufferPseudoSourceValue() : + PseudoSourceValue(PseudoSourceValue::TargetCustom) { } + + bool isConstant(const MachineFrameInfo *) const override { + // This should probably be true for most images, but we will start by being + // conservative. + return false; + } + + bool isAliased(const MachineFrameInfo *) const override { + // FIXME: If we ever change image intrinsics to accept fat pointers, then + // this could be true for some cases. + return false; + } + + bool mayAlias(const MachineFrameInfo*) const override { + // FIXME: If we ever change image intrinsics to accept fat pointers, then + // this could be true for some cases. + return false; + } +}; + /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // FIXME: This should be removed and getPreloadedValue moved here. - friend struct SIRegisterInfo; - void anchor() override; + friend class SIRegisterInfo; unsigned TIDReg; @@ -37,6 +84,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // Input registers for non-HSA ABI + unsigned PrivateMemoryPtrUserSGPR; + // Input registers setup for the HSA ABI. // User SGPRs in allocation order. unsigned PrivateSegmentBufferUserSGPR; @@ -61,15 +111,22 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned PSInputAddr; bool ReturnsVoid; - unsigned MaximumWorkGroupSize; + // A pair of default/requested minimum/maximum flat work group sizes. + // Minimum - first, maximum - second. + std::pair<unsigned, unsigned> FlatWorkGroupSizes; + + // A pair of default/requested minimum/maximum number of waves per execution + // unit. Minimum - first, maximum - second. + std::pair<unsigned, unsigned> WavesPerEU; - // Number of reserved VGPRs for debugger usage. - unsigned DebuggerReservedVGPRCount; // Stack object indices for work group IDs. std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices; // Stack object indices for work item IDs. 
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices; + AMDGPUBufferPseudoSourceValue BufferPSV; + AMDGPUImagePseudoSourceValue ImagePSV; + public: // FIXME: Make private unsigned LDSWaveSpillSize; @@ -83,7 +140,6 @@ private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; bool HasNonSpillStackObjects; - bool HasFlatInstructions; unsigned NumSpilledSGPRs; unsigned NumSpilledVGPRs; @@ -92,8 +148,8 @@ private: bool PrivateSegmentBuffer : 1; bool DispatchPtr : 1; bool QueuePtr : 1; - bool DispatchID : 1; bool KernargSegmentPtr : 1; + bool DispatchID : 1; bool FlatScratchInit : 1; bool GridWorkgroupCountX : 1; bool GridWorkgroupCountY : 1; @@ -110,6 +166,11 @@ private: bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + // Private memory buffer + // Compute directly in sgpr[0:1] + // Other shaders indirect 64-bits at sgpr[0:1] + bool PrivateMemoryInputPtr : 1; + MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -143,7 +204,9 @@ public: unsigned addDispatchPtr(const SIRegisterInfo &TRI); unsigned addQueuePtr(const SIRegisterInfo &TRI); unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); + unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -192,14 +255,14 @@ public: return QueuePtr; } - bool hasDispatchID() const { - return DispatchID; - } - bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } + bool hasDispatchID() const { + return DispatchID; + } + bool hasFlatScratchInit() const { return FlatScratchInit; } @@ -248,6 +311,10 @@ public: return WorkItemIDZ; } + bool hasPrivateMemoryInputPtr() const { + return PrivateMemoryInputPtr; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -284,6 +351,10 @@ public: return QueuePtrUserSGPR; } + unsigned getPrivateMemoryPtrUserSGPR() const { + return PrivateMemoryPtrUserSGPR; + } + bool hasSpilledSGPRs() const { return HasSpilledSGPRs; } @@ -308,14 +379,6 @@ public: HasNonSpillStackObjects = StackObject; } - bool hasFlatInstructions() const { - return HasFlatInstructions; - } - - void setHasFlatInstructions(bool UseFlat = true) { - HasFlatInstructions = UseFlat; - } - unsigned getNumSpilledSGPRs() const { return NumSpilledSGPRs; } @@ -352,9 +415,36 @@ public: ReturnsVoid = Value; } - /// \returns Number of reserved VGPRs for debugger usage. - unsigned getDebuggerReservedVGPRCount() const { - return DebuggerReservedVGPRCount; + /// \returns A pair of default/requested minimum/maximum flat work group sizes + /// for this function. + std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const { + return FlatWorkGroupSizes; + } + + /// \returns Default/requested minimum flat work group size for this function. + unsigned getMinFlatWorkGroupSize() const { + return FlatWorkGroupSizes.first; + } + + /// \returns Default/requested maximum flat work group size for this function. + unsigned getMaxFlatWorkGroupSize() const { + return FlatWorkGroupSizes.second; + } + + /// \returns A pair of default/requested minimum/maximum number of waves per + /// execution unit. + std::pair<unsigned, unsigned> getWavesPerEU() const { + return WavesPerEU; + } + + /// \returns Default/requested minimum number of waves per execution unit. + unsigned getMinWavesPerEU() const { + return WavesPerEU.first; + } + + /// \returns Default/requested maximum number of waves per execution unit. 
+ unsigned getMaxWavesPerEU() const { + return WavesPerEU.second; } /// \returns Stack object index for \p Dim's work group ID. @@ -413,7 +503,13 @@ public: llvm_unreachable("unexpected dimension"); } - unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; + const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { + return &BufferPSV; + } + + const AMDGPUImagePseudoSourceValue *getImagePSV() const { + return &ImagePSV; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 7125b41..da86bbf 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1,4 +1,4 @@ -//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===// +//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// // // The LLVM Compiler Infrastructure // @@ -13,12 +13,28 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "SIInstrInfo.h" #include "SIMachineScheduler.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <map> +#include <set> +#include <utility> +#include <vector> using namespace llvm; @@ -77,11 +93,11 @@ using namespace llvm; // The block creation algorithm is divided into several steps, and several // variants can be tried during the scheduling process. // -// Second the order of the instructions inside the blocks is choosen. +// Second the order of the instructions inside the blocks is chosen. // At that step we do take into account only register usage and hiding // low latency instructions // -// Third the block order is choosen, there we try to hide high latencies +// Third the block order is chosen, there we try to hide high latencies // and keep register usage low. // // After the third step, a pass is done to improve the hiding of low @@ -89,7 +105,7 @@ using namespace llvm; // // Actually when talking about 'low latency' or 'high latency' it includes // both the latency to get the cache (or global mem) data go to the register, -// and the bandwith limitations. +// and the bandwidth limitations. // Increasing the number of active wavefronts helps hide the former, but it // doesn't solve the latter, thus why even if wavefront count is high, we have // to try have as many instructions hiding high latencies as possible. @@ -120,7 +136,6 @@ using namespace llvm; // 300-600 cycles. We do not specially take that into account when scheduling // As we expect the driver to be able to preload the constants soon. - // common code // #ifndef NDEBUG @@ -181,7 +196,6 @@ void SIScheduleBlock::addUnit(SUnit *SU) { } #ifndef NDEBUG - void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); @@ -209,7 +223,7 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, // we haven't waited for // . 
Low latencies // . All other instructions - // Goal is to get: low latency instructions - independant instructions + // Goal is to get: low latency instructions - independent instructions // - (eventually some more low latency instructions) // - instructions that depend on the first low latency instructions. // If in the block there is a lot of constant loads, the SGPR usage @@ -479,8 +493,7 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { void SIScheduleBlock::nodeScheduled(SUnit *SU) { // Is in TopReadySUs assert (!SU->NumPredsLeft); - std::vector<SUnit*>::iterator I = - std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); if (I == TopReadySUs.end()) { dbgs() << "Data Structure Bug in SI Scheduler\n"; llvm_unreachable(nullptr); @@ -589,9 +602,8 @@ void SIScheduleBlock::printDebug(bool full) { } } - dbgs() << "///////////////////////\n"; + dbgs() << "///////////////////////\n"; } - #endif // SIScheduleBlockCreator // @@ -600,8 +612,7 @@ SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : DAG(DAG) { } -SIScheduleBlockCreator::~SIScheduleBlockCreator() { -} +SIScheduleBlockCreator::~SIScheduleBlockCreator() = default; SIScheduleBlocks SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { @@ -1059,8 +1070,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria unsigned Color = CurrentColoring[SU->NodeNum]; if (RealID.find(Color) == RealID.end()) { int ID = CurrentBlocks.size(); - BlockPtrs.push_back( - make_unique<SIScheduleBlock>(DAG, this, ID)); + BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID)); CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); RealID[Color] = ID; } @@ -1104,30 +1114,17 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria // Two functions taken from Codegen/MachineScheduler.cpp -/// If this iterator is a debug value, increment until reaching the End or a -/// non-debug instruction. -static MachineBasicBlock::const_iterator -nextIfDebug(MachineBasicBlock::const_iterator I, +/// Non-const version. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::const_iterator End) { - for(; I != End; ++I) { + for (; I != End; ++I) { if (!I->isDebugValue()) break; } return I; } -/// Non-const version. -static MachineBasicBlock::iterator -nextIfDebug(MachineBasicBlock::iterator I, - MachineBasicBlock::const_iterator End) { - // Cast the return value to nonconst MachineInstr, then cast to an - // instr_iterator, which does not check for null, finally return a - // bundle_iterator. - return MachineBasicBlock::instr_iterator( - const_cast<MachineInstr*>( - &*nextIfDebug(MachineBasicBlock::const_iterator(I), End))); -} - void SIScheduleBlockCreator::topologicalSort() { unsigned DAGSize = CurrentBlocks.size(); std::vector<int> WorkList; @@ -1217,7 +1214,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); // Update LiveIntervals. - // Note: Moving all instructions and calling handleMove everytime + // Note: Moving all instructions and calling handleMove every time // is the most cpu intensive operation of the scheduler. // It would gain a lot if there was a way to recompute the // LiveIntervals for the entire scheduling region. 
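
For reference, the color-to-block mapping in createBlocksForVariant above amounts to handing out a dense block ID the first time each color appears and grouping SUnits that share a color. A small standalone sketch with toy coloring values (not taken from the pass):

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Toy per-SUnit coloring; the real pass derives these from dependencies.
  std::vector<int> CurrentColoring = {7, 3, 7, 9, 3, 3};

  std::map<int, int> RealID;             // color -> dense block ID
  std::vector<std::vector<int>> Blocks;  // block ID -> member SUnits

  for (int SU = 0, E = (int)CurrentColoring.size(); SU != E; ++SU) {
    int Color = CurrentColoring[SU];
    if (RealID.find(Color) == RealID.end()) {
      RealID[Color] = (int)Blocks.size();  // first time this color is seen
      Blocks.emplace_back();
    }
    Blocks[RealID[Color]].push_back(SU);
  }

  for (unsigned B = 0; B != Blocks.size(); ++B) {
    std::printf("Block %u:", B);
    for (int SU : Blocks[B])
      std::printf(" SU(%d)", SU);
    std::printf("\n");
  }
  return 0;
}
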
@@ -1265,7 +1262,7 @@ void SIScheduleBlockCreator::fillStats() { for (unsigned i = 0, e = DAGSize; i != e; ++i) { int BlockIndice = TopDownIndex2Block[i]; SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; - if (Block->getPreds().size() == 0) + if (Block->getPreds().empty()) Block->Depth = 0; else { unsigned Depth = 0; @@ -1280,7 +1277,7 @@ void SIScheduleBlockCreator::fillStats() { for (unsigned i = 0, e = DAGSize; i != e; ++i) { int BlockIndice = BottomUpIndex2Block[i]; SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; - if (Block->getSuccs().size() == 0) + if (Block->getSuccs().empty()) Block->Height = 0; else { unsigned Height = 0; @@ -1654,20 +1651,15 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, // SIScheduleDAGMI // SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : - ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) { SITII = static_cast<const SIInstrInfo*>(TII); SITRI = static_cast<const SIRegisterInfo*>(TRI); - VGPRSetID = SITRI->getVGPR32PressureSet(); - SGPRSetID = SITRI->getSGPR32PressureSet(); -} - -SIScheduleDAGMI::~SIScheduleDAGMI() { + VGPRSetID = SITRI->getVGPRPressureSet(); + SGPRSetID = SITRI->getSGPRPressureSet(); } -ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { - return new SIScheduleDAGMI(C); -} +SIScheduleDAGMI::~SIScheduleDAGMI() = default; // Code adapted from scheduleDAG.cpp // Does a topological sort over the SUs. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index 117aed4..77c0735 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -1,4 +1,4 @@ -//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,10 +16,16 @@ #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H #include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterPressure.h" - -using namespace llvm; +#include "llvm/CodeGen/ScheduleDAG.h" +#include <cassert> +#include <cstdint> +#include <map> +#include <memory> +#include <set> +#include <vector> namespace llvm { @@ -93,12 +99,10 @@ class SIScheduleBlock { public: SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, unsigned ID): - DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(), - TopRPTracker(TopPressure), Scheduled(false), - HighLatencyBlock(false), ID(ID), - Preds(), Succs(), NumHighLatencySuccessors(0) {}; + DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false), + HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {} - ~SIScheduleBlock() {}; + ~SIScheduleBlock() = default; unsigned getID() const { return ID; } @@ -146,7 +150,6 @@ public: bool isScheduled() { return Scheduled; } - // Needs the block to be scheduled inside // TODO: find a way to compute it. std::vector<unsigned> &getInternalAdditionnalRegUsage() { @@ -161,7 +164,7 @@ public: private: struct SISchedCandidate : SISchedulerCandidate { // The best SUnit candidate. 
- SUnit *SU; + SUnit *SU = nullptr; unsigned SGPRUsage; unsigned VGPRUsage; @@ -169,8 +172,7 @@ private: unsigned LowLatencyOffset; bool HasLowLatencyNonWaitedParent; - SISchedCandidate() - : SU(nullptr) {} + SISchedCandidate() = default; bool isValid() const { return SU; } @@ -341,17 +343,17 @@ public: SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, SISchedulerBlockSchedulerVariant Variant, SIScheduleBlocks BlocksStruct); - ~SIScheduleBlockScheduler() {}; + ~SIScheduleBlockScheduler() = default; - std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }; + std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; } - unsigned getVGPRUsage() { return maxVregUsage; }; - unsigned getSGPRUsage() { return maxSregUsage; }; + unsigned getVGPRUsage() { return maxVregUsage; } + unsigned getSGPRUsage() { return maxSregUsage; } private: struct SIBlockSchedCandidate : SISchedulerCandidate { // The best Block candidate. - SIScheduleBlock *Block; + SIScheduleBlock *Block = nullptr; bool IsHighLatency; int VGPRUsageDiff; @@ -360,8 +362,7 @@ private: unsigned LastPosHighLatParentScheduled; unsigned Height; - SIBlockSchedCandidate() - : Block(nullptr) {} + SIBlockSchedCandidate() = default; bool isValid() const { return Block; } @@ -409,9 +410,9 @@ class SIScheduler { SIScheduleBlockCreator BlockCreator; public: - SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {} - ~SIScheduler() {}; + ~SIScheduler() = default; struct SIScheduleBlockResult scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, @@ -445,13 +446,13 @@ public: } MachineBasicBlock *getBB() { return BB; } - MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; - MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; } + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; } LiveIntervals *getLIS() { return LIS; } MachineRegisterInfo *getMRI() { return &MRI; } const TargetRegisterInfo *getTRI() { return TRI; } - SUnit& getEntrySU() { return EntrySU; }; - SUnit& getExitSU() { return ExitSU; }; + SUnit& getEntrySU() { return EntrySU; } + SUnit& getExitSU() { return ExitSU; } void restoreSULinksLeft(); @@ -459,13 +460,14 @@ public: _Iterator End, unsigned &VgprUsage, unsigned &SgprUsage); + std::set<unsigned> getInRegs() { std::set<unsigned> InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { InRegs.insert(RegMaskPair.RegUnit); } return InRegs; - }; + } unsigned getVGPRSetID() const { return VGPRSetID; } unsigned getSGPRSetID() const { return SGPRSetID; } @@ -486,6 +488,6 @@ public: std::vector<int> BottomUpIndex2SU; }; -} // namespace llvm +} // end namespace llvm -#endif /* SIMACHINESCHEDULER_H_ */ +#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp new file mode 100644 index 0000000..4d2f917 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -0,0 +1,304 @@ +//===-- SIOptimizeExecMasking.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-optimize-exec-masking" + +namespace { + +class SIOptimizeExecMasking : public MachineFunctionPass { +public: + static char ID; + +public: + SIOptimizeExecMasking() : MachineFunctionPass(ID) { + initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI optimize exec mask operations"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE, + "SI optimize exec mask operations", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE, + "SI optimize exec mask operations", false, false) + +char SIOptimizeExecMasking::ID = 0; + +char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; + +/// If \p MI is a copy from exec, return the register copied to. +static unsigned isCopyFromExec(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B64_term: { + const MachineOperand &Src = MI.getOperand(1); + if (Src.isReg() && Src.getReg() == AMDGPU::EXEC) + return MI.getOperand(0).getReg(); + } + } + + return AMDGPU::NoRegister; +} + +/// If \p MI is a copy to exec, return the register copied from. +static unsigned isCopyToExec(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::S_MOV_B64: { + const MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC) + return MI.getOperand(1).getReg(); + break; + } + case AMDGPU::S_MOV_B64_term: + llvm_unreachable("should have been replaced"); + } + + return AMDGPU::NoRegister; +} + +static unsigned getSaveExecOp(unsigned Opc) { + switch (Opc) { + case AMDGPU::S_AND_B64: + return AMDGPU::S_AND_SAVEEXEC_B64; + case AMDGPU::S_OR_B64: + return AMDGPU::S_OR_SAVEEXEC_B64; + case AMDGPU::S_XOR_B64: + return AMDGPU::S_XOR_SAVEEXEC_B64; + case AMDGPU::S_ANDN2_B64: + return AMDGPU::S_ANDN2_SAVEEXEC_B64; + case AMDGPU::S_ORN2_B64: + return AMDGPU::S_ORN2_SAVEEXEC_B64; + case AMDGPU::S_NAND_B64: + return AMDGPU::S_NAND_SAVEEXEC_B64; + case AMDGPU::S_NOR_B64: + return AMDGPU::S_NOR_SAVEEXEC_B64; + case AMDGPU::S_XNOR_B64: + return AMDGPU::S_XNOR_SAVEEXEC_B64; + default: + return AMDGPU::INSTRUCTION_LIST_END; + } +} + +// These are only terminators to get correct spill code placement during +// register allocation, so turn them back into normal instructions. Only one of +// these is expected per block. +static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_MOV_B64_term: { + MI.setDesc(TII.get(AMDGPU::COPY)); + return true; + } + case AMDGPU::S_XOR_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+ return true;
+ }
+ case AMDGPU::S_ANDN2_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+static MachineBasicBlock::reverse_iterator fixTerminators(
+ const SIInstrInfo &TII,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+ for (; I != E; ++I) {
+ if (!I->isTerminator())
+ return I;
+
+ if (removeTerminatorBit(TII, *I))
+ return I;
+ }
+
+ return E;
+}
+
+static MachineBasicBlock::reverse_iterator findExecCopy(
+ const SIInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) {
+ const unsigned InstLimit = 25;
+
+ auto E = MBB.rend();
+ for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
+ unsigned CopyFromExec = isCopyFromExec(*I);
+ if (CopyFromExec != AMDGPU::NoRegister)
+ return I;
+ }
+
+ return E;
+}
+
+// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
+// report the register as unavailable because a super-register with a lane mask
+// is unavailable.
+static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ if (Succ->isLiveIn(Reg))
+ return true;
+ }
+
+ return false;
+}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // Optimize sequences emitted for control flow lowering. They are originally
+ // emitted as the separate operations because spill code may need to be
+ // inserted for the saved copy of exec.
+ //
+ // x = copy exec
+ // z = s_<op>_b64 x, y
+ // exec = copy z
+ // =>
+ // x = s_<op>_saveexec_b64 y
+ //
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+ MachineBasicBlock::reverse_iterator E = MBB.rend();
+ if (I == E)
+ continue;
+
+ unsigned CopyToExec = isCopyToExec(*I);
+ if (CopyToExec == AMDGPU::NoRegister)
+ continue;
+
+ // Scan backwards to find the def.
+ auto CopyToExecInst = &*I;
+ auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+ if (CopyFromExecInst == E)
+ continue;
+
+ if (isLiveOut(MBB, CopyToExec)) {
+ // The copied register is live out and has a second use in another block.
+ DEBUG(dbgs() << "Exec copy source register is live out\n");
+ continue;
+ }
+
+ unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+ MachineInstr *SaveExecInst = nullptr;
+ SmallVector<MachineInstr *, 4> OtherUseInsts;
+
+ for (MachineBasicBlock::iterator J
+ = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+ J != JE; ++J) {
+ if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+ DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+ // Make sure this is inserted after any VALU ops that may have been
+ // scheduled in between.
+ SaveExecInst = nullptr; + break; + } + + if (J->modifiesRegister(CopyToExec, TRI)) { + if (SaveExecInst) { + DEBUG(dbgs() << "Multiple instructions modify " + << PrintReg(CopyToExec, TRI) << '\n'); + SaveExecInst = nullptr; + break; + } + + unsigned SaveExecOp = getSaveExecOp(J->getOpcode()); + if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END) + break; + + if (J->readsRegister(CopyFromExec, TRI)) { + SaveExecInst = &*J; + DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); + continue; + } else { + DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n'); + break; + } + } + + if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) { + assert(SaveExecInst != &*J); + OtherUseInsts.push_back(&*J); + } + } + + if (!SaveExecInst) + continue; + + DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); + + MachineOperand &Src0 = SaveExecInst->getOperand(1); + MachineOperand &Src1 = SaveExecInst->getOperand(2); + + MachineOperand *OtherOp = nullptr; + + if (Src0.isReg() && Src0.getReg() == CopyFromExec) { + OtherOp = &Src1; + } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) { + if (!SaveExecInst->isCommutable()) + break; + + OtherOp = &Src0; + } else + llvm_unreachable("unexpected"); + + CopyFromExecInst->eraseFromParent(); + + auto InsPt = SaveExecInst->getIterator(); + const DebugLoc &DL = SaveExecInst->getDebugLoc(); + + BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())), + CopyFromExec) + .addReg(OtherOp->getReg()); + SaveExecInst->eraseFromParent(); + + CopyToExecInst->eraseFromParent(); + + for (MachineInstr *OtherInst : OtherUseInsts) { + OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC, + AMDGPU::NoSubRegister, *TRI); + } + } + + return true; + +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 347c33f..a1ed5e8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -24,52 +24,11 @@ using namespace llvm; -static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) { - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - unsigned SIMDPerCU = 4; - - unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize(); - return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) / - MaxInvocationsPerWave; -} - -static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); - - unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment; - unsigned ReservedSGPRCount; - - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - TotalSGPRCountPerSIMD = 800; - AddressableSGPRCount = 102; - SGPRUsageAlignment = 16; - ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK - } else { - TotalSGPRCountPerSIMD = 512; - AddressableSGPRCount = 104; - SGPRUsageAlignment = 8; - ReservedSGPRCount = 2; // VCC - } +static cl::opt<bool> EnableSpillSGPRToSMEM( + "amdgpu-spill-sgpr-to-smem", + cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), + cl::init(false)); - unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD); - MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment); - - if (ST.hasSGPRInitBug()) - MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - - return std::min(MaxSGPRCount - ReservedSGPRCount, 
AddressableSGPRCount); -} - -static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) { - unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); - unsigned TotalVGPRCountPerSIMD = 256; - unsigned VGPRUsageAlignment = 4; - - return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD, - VGPRUsageAlignment); -} static bool hasPressureSet(const int *PSets, unsigned PSetID) { for (unsigned i = 0; PSets[i] != -1; ++i) { @@ -95,19 +54,38 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(), VGPRPressureSets(getNumRegPressureSets()) { unsigned NumRegPressureSets = getNumRegPressureSets(); - SGPR32SetID = NumRegPressureSets; - VGPR32SetID = NumRegPressureSets; - for (unsigned i = 0; i < NumRegPressureSets; ++i) { - if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) - SGPR32SetID = i; - else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) - VGPR32SetID = i; + SGPRSetID = NumRegPressureSets; + VGPRSetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); } - assert(SGPR32SetID < NumRegPressureSets && - VGPR32SetID < NumRegPressureSets); + + // Determine the number of reg units for each pressure set. + std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0); + for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) { + const int *PSets = getRegUnitPressureSets(i); + for (unsigned j = 0; PSets[j] != -1; ++j) { + ++PressureSetRegUnits[PSets[j]]; + } + } + + unsigned VGPRMax = 0, SGPRMax = 0; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { + VGPRSetID = i; + VGPRMax = PressureSetRegUnits[i]; + continue; + } + if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) { + SGPRSetID = i; + SGPRMax = PressureSetRegUnits[i]; + } + } + + assert(SGPRSetID < NumRegPressureSets && + VGPRSetID < NumRegPressureSets); } void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { @@ -119,14 +97,14 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4; + unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - unsigned RegCount = getMaxWorkGroupSGPRCount(MF); + unsigned RegCount = getMaxNumSGPRs(MF); unsigned Reg; // Try to place it in a hole after PrivateSegmentbufferReg. 
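
For reference, the constructor change above picks SGPRSetID and VGPRSetID as the pressure sets covering the most register units of each kind. A compilable toy version of that selection follows; the unit counts and set kinds are invented inputs, not real subtarget data.

#include <cstdio>
#include <vector>

int main() {
  // Made-up inputs: how many register units feed each pressure set, and
  // whether the set tracks SGPRs or VGPRs.
  std::vector<unsigned> PressureSetRegUnits = {104, 24, 256, 64};
  std::vector<bool> IsSGPRSet = {true, true, false, false};
  std::vector<bool> IsVGPRSet = {false, false, true, true};

  unsigned NumSets = (unsigned)PressureSetRegUnits.size();
  unsigned SGPRSetID = NumSets, VGPRSetID = NumSets;
  unsigned SGPRMax = 0, VGPRMax = 0;

  // Same shape as the loop in the SIRegisterInfo constructor: keep the set
  // with the largest unit count for each register kind.
  for (unsigned i = 0; i < NumSets; ++i) {
    if (IsVGPRSet[i] && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (IsSGPRSet[i] && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
  }

  std::printf("SGPRSetID=%u VGPRSetID=%u\n", SGPRSetID, VGPRSetID);
  return 0;
}
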
@@ -161,18 +139,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); - unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF); - unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF); - - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) { + unsigned MaxNumSGPRs = getMaxNumSGPRs(MF); + unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); } - - for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) { + unsigned MaxNumVGPRs = getMaxNumVGPRs(MF); + unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); } @@ -194,49 +170,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } - // Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs" - // attribute was specified. - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (ST.debuggerReserveRegs()) { - unsigned ReservedVGPRFirst = - MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount(); - for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) { - unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - } - return Reserved; } -unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { - const SISubtarget &STI = MF.getSubtarget<SISubtarget>(); - // FIXME: We should adjust the max number of waves based on LDS size. - unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); - - unsigned VSLimit = SGPRLimit + VGPRLimit; - - if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) { - // FIXME: This is a hack. We should never be considering the pressure of - // these since no virtual register should ever have this class. - return VSLimit; - } - - if (SGPRPressureSets.test(Idx)) - return SGPRLimit; - - return VGPRLimit; -} - bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); + return Fn.getFrameInfo().hasStackObjects(); } bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { - return MF.getFrameInfo()->hasStackObjects(); + return MF.getFrameInfo().hasStackObjects(); +} + +bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( + const MachineFunction &MF) const { + // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't + // create a virtual register for it during frame index elimination, so the + // scavenger is directly needed. 
+ return MF.getFrameInfo().hasStackObjects() && + MF.getSubtarget<SISubtarget>().hasScalarStores() && + MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); } bool SIRegisterInfo::requiresVirtualBaseRegisters( @@ -250,6 +203,14 @@ bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const return true; } +int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { + assert(SIInstrInfo::isMUBUF(*MI)); + + int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::offset); + return MI->getOperand(OffIdx).getImm(); +} + int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { if (!SIInstrInfo::isMUBUF(*MI)) @@ -259,13 +220,16 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, AMDGPU::OpName::vaddr) && "Should never see frame index on non-address operand"); - int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::offset); - return MI->getOperand(OffIdx).getImm(); + return getMUBUFInstrOffset(MI); } bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { - return MI->mayLoadOrStore(); + if (!MI->mayLoadOrStore()) + return false; + + int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); + + return !isUInt<12>(FullOffset); } void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, @@ -290,14 +254,19 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) + .addFrameIndex(FrameIdx); + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead) .addReg(OffsetReg, RegState::Kill) - .addFrameIndex(FrameIdx); + .addReg(FIReg); } void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, @@ -328,40 +297,21 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; - if (isUInt<12>(NewOffset)) { - // If we have a legal offset, fold it directly into the instruction. - FIOp->ChangeToRegister(BaseReg, false); - OffsetOp->setImm(NewOffset); - return; - } - - // The offset is not legal, so we must insert an add of the offset. - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - DebugLoc DL = MI.getDebugLoc(); - - assert(Offset != 0 && "Non-zero offset expected"); - - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + assert(isUInt<12>(NewOffset) && "offset should be legal"); - // In the case the instruction already had an immediate offset, here only - // the requested new offset is added because we are leaving the original - // immediate in place. 
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(Offset); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg) - .addReg(UnusedCarry, RegState::Define | RegState::Dead) - .addReg(OffsetReg, RegState::Kill) - .addReg(BaseReg); - - FIOp->ChangeToRegister(NewReg, false); + FIOp->ChangeToRegister(BaseReg, false); + OffsetOp->setImm(NewOffset); } bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const { - return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset); + if (!SIInstrInfo::isMUBUF(*MI)) + return false; + + int64_t NewOffset = Offset + getMUBUFInstrOffset(MI); + + return isUInt<12>(NewOffset); } const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( @@ -407,31 +357,107 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { } } -void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - const MachineOperand *SrcDst, - unsigned ScratchRsrcReg, - unsigned ScratchOffset, - int64_t Offset, - RegScavenger *RS) const { +static int getOffsetMUBUFStore(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + return AMDGPU::BUFFER_STORE_DWORD_OFFSET; + case AMDGPU::BUFFER_STORE_BYTE_OFFEN: + return AMDGPU::BUFFER_STORE_BYTE_OFFSET; + case AMDGPU::BUFFER_STORE_SHORT_OFFEN: + return AMDGPU::BUFFER_STORE_SHORT_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + default: + return -1; + } +} + +static int getOffsetMUBUFLoad(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: + return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; + case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: + return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; + case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: + return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; + case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: + return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; + default: + return -1; + } +} - unsigned Value = SrcDst->getReg(); - bool IsKill = SrcDst->isKill(); +// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not +// need to handle the case where an SGPR may need to be spilled while spilling. +static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, + MachineFrameInfo &MFI, + MachineBasicBlock::iterator MI, + int Index, + int64_t Offset) { + MachineBasicBlock *MBB = MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); + bool IsStore = MI->mayStore(); + + unsigned Opc = MI->getOpcode(); + int LoadStoreOp = IsStore ? 
+ getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); + if (LoadStoreOp == -1) + return false; + + unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg(); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(Reg, getDefRegState(!IsStore)) + .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) + .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + return true; +} + +void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + int Index, + unsigned ValueReg, + bool IsKill, + unsigned ScratchRsrcReg, + unsigned ScratchOffsetReg, + int64_t InstOffset, + MachineMemOperand *MMO, + RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); - DebugLoc DL = MI->getDebugLoc(); - bool IsStore = MI->mayStore(); + const MCInstrDesc &Desc = TII->get(LoadStoreOp); + const DebugLoc &DL = MI->getDebugLoc(); + bool IsStore = Desc.mayStore(); bool RanOutOfSGPRs = false; bool Scavenged = false; - unsigned SOffset = ScratchOffset; - unsigned OriginalImmOffset = Offset; + unsigned SOffset = ScratchOffsetReg; - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); + unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; unsigned Size = NumSubRegs * 4; + int64_t Offset = InstOffset + MFI.getObjectOffset(Index); + const int64_t OriginalImmOffset = Offset; + + unsigned Align = MFI.getObjectAlignment(Index); + const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); if (!isUInt<12>(Offset + Size)) { SOffset = AMDGPU::NoRegister; @@ -450,20 +476,23 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, // subtract the offset after the spill to return ScratchOffset to it's // original value. RanOutOfSGPRs = true; - SOffset = ScratchOffset; + SOffset = ScratchOffsetReg; } else { Scavenged = true; } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffset) - .addImm(Offset); + .addReg(ScratchOffsetReg) + .addImm(Offset); + Offset = 0; } - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { - unsigned SubReg = NumSubRegs > 1 ? - getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : - Value; + const unsigned EltSize = 4; + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { + unsigned SubReg = NumSubRegs == 1 ? 
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); @@ -473,23 +502,324 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(!IsStore)) + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); + MachineMemOperand *NewMMO + = MF->getMachineMemOperand(PInfo, MMO->getFlags(), + EltSize, MinAlign(Align, EltSize * i)); + + auto MIB = BuildMI(*MBB, MI, DL, Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) .addReg(ScratchRsrcReg) .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe - .addReg(Value, RegState::Implicit | SrcDstRegState) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + .addMemOperand(NewMMO); + + if (NumSubRegs > 1) + MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); } + if (RanOutOfSGPRs) { // Subtract the offset we added to the ScratchOffset register. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset) - .addReg(ScratchOffset) - .addImm(OriginalImmOffset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) + .addReg(ScratchOffsetReg) + .addImm(OriginalImmOffset); + } +} + +static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, + bool Store) { + if (SuperRegSize % 16 == 0) { + return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : + AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; + } + + if (SuperRegSize % 8 == 0) { + return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : + AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; } + + return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR : + AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; +} + +void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, + int Index, + RegScavenger *RS) const { + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned SuperReg = MI->getOperand(0).getReg(); + bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + + unsigned OffsetReg = AMDGPU::M0; + unsigned M0CopyReg = AMDGPU::NoRegister; + + if (SpillToSMEM) { + if (RS->isRegUsed(AMDGPU::M0)) { + M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) + .addReg(AMDGPU::M0); + } + } + + unsigned ScalarStoreOp; + unsigned EltSize = 4; + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); + if (SpillToSMEM && isSGPRClass(RC)) { + // XXX - if private_element_size is larger than 4 it might be useful to be + // able to spill wider vmem spills. + std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true); + } + + ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); + unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + + // SubReg carries the "Kill" flag when SubReg == SuperReg. 
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = NumSubRegs == 1 ? + SuperReg : getSubReg(SuperReg, SplitParts[i]); + + if (SpillToSMEM) { + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + + // The allocated memory size is really the wavefront size * the frame + // index size. The widest register class is 64 bytes, so a 4-byte scratch + // allocation is enough to spill this in a single stack object. + // + // FIXME: Frame size/offsets are computed earlier than this, so the extra + // space is still unnecessarily allocated. + + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + EltSize, MinAlign(Align, EltSize * i)); + + // SMEM instructions only support a single offset, so increment the wave + // offset. + + int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) + .addReg(SubReg, getKillRegState(IsKill)) // sdata + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg, RegState::Kill) // soff + .addImm(0) // glc + .addMemOperand(MMO); + + continue; + } + + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + if (Spill.hasReg()) { + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg, getKillRegState(IsKill)) + .addImm(Spill.Lane); + + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. + } else { + // Spill SGPR to a frame index. + // TODO: Should VI try to spill to VGPR and then spill to SMEM? + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // TODO: Should VI try to spill to VGPR and then spill to SMEM? + + MachineInstrBuilder Mov + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(SubReg, SubKillState); + + + // There could be undef components of a spilled super register. + // TODO: Can we detect this and skip the spill? + if (NumSubRegs > 1) { + // The last implicit use of the SuperReg carries the "Kill" flag. 
+ unsigned SuperKillState = 0; + if (i + 1 == e) + SuperKillState |= getKillRegState(IsKill); + Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); + } + + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + EltSize, MinAlign(Align, EltSize * i)); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getScratchWaveOffsetReg()) // soffset + .addImm(i * 4) // offset + .addMemOperand(MMO); + } + } + + if (M0CopyReg != AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0CopyReg, RegState::Kill); + } + + MI->eraseFromParent(); + MFI->addToSpilledSGPRs(NumSubRegs); +} + +void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, + int Index, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const DebugLoc &DL = MI->getDebugLoc(); + + unsigned SuperReg = MI->getOperand(0).getReg(); + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + + unsigned OffsetReg = AMDGPU::M0; + unsigned M0CopyReg = AMDGPU::NoRegister; + + if (SpillToSMEM) { + if (RS->isRegUsed(AMDGPU::M0)) { + M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) + .addReg(AMDGPU::M0); + } + } + + unsigned EltSize = 4; + unsigned ScalarLoadOp; + + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); + if (SpillToSMEM && isSGPRClass(RC)) { + // XXX - if private_element_size is larger than 4 it might be useful to be + // able to spill wider vmem spills. + std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false); + } + + ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); + unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + + // SubReg carries the "Kill" flag when SubReg == SuperReg. + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = NumSubRegs == 1 ? + SuperReg : getSubReg(SuperReg, SplitParts[i]); + + if (SpillToSMEM) { + // FIXME: Size may be > 4 but extra bytes wasted. 
+ unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + EltSize, MinAlign(Align, EltSize * i)); + + // Add i * 4 offset + int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg, RegState::Kill) // soff + .addImm(0) // glc + .addMemOperand(MMO); + + if (NumSubRegs > 1) + MIB.addReg(SuperReg, RegState::ImplicitDefine); + + continue; + } + + SIMachineFunctionInfo::SpilledReg Spill + = MFI->getSpilledReg(MF, Index, i); + + if (Spill.hasReg()) { + auto MIB = + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane); + + if (NumSubRegs > 1) + MIB.addReg(SuperReg, RegState::ImplicitDefine); + } else { + // Restore SGPR from a stack slot. + // FIXME: We should use S_LOAD_DWORD here for VI. + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned Align = FrameInfo.getObjectAlignment(Index); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + + MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, + MachineMemOperand::MOLoad, EltSize, + MinAlign(Align, EltSize * i)); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getScratchWaveOffsetReg()) // soffset + .addImm(i * 4) // offset + .addMemOperand(MMO); + + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpReg, RegState::Kill); + + if (NumSubRegs > 1) + MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + } + + if (M0CopyReg != AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0CopyReg, RegState::Kill); + } + + MI->eraseFromParent(); } void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, @@ -499,7 +829,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -514,66 +844,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - unsigned SuperReg = MI->getOperand(0).getReg(); - bool IsKill = MI->getOperand(0).isKill(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. 
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(SuperReg, - &AMDGPU::SGPR_32RegClass, i); - - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.hasReg()) { - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg, getKillRegState(IsKill)) - .addImm(Spill.Lane); - - // FIXME: Since this spills to another register instead of an actual - // frame index, we should delete the frame index when all references to - // it are fixed. - } else { - // Spill SGPR to a frame index. - // FIXME we should use S_STORE_DWORD here for VI. - MachineInstrBuilder Mov - = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addReg(SubReg, SubKillState); - - - // There could be undef components of a spilled super register. - // TODO: Can we detect this and skip the spill? - if (NumSubRegs > 1) { - // The last implicit use of the SuperReg carries the "Kill" flag. - unsigned SuperKillState = 0; - if (i + 1 == e) - SuperKillState |= getKillRegState(IsKill); - Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); - } - - unsigned Size = FrameInfo->getObjectSize(Index); - unsigned Align = FrameInfo->getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - Size, Align); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(i * 4) // offset - .addMemOperand(MMO); - } - } - MI->eraseFromParent(); - MFI->addToSpilledSGPRs(NumSubRegs); + spillSGPR(MI, Index, RS); break; } @@ -583,49 +854,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.hasReg()) { - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } else { - // Restore SGPR from a stack slot. - // FIXME: We should use S_LOAD_DWORD here for VI. 
- - unsigned Align = FrameInfo->getObjectAlignment(Index); - unsigned Size = FrameInfo->getObjectSize(Index); - - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index); - - MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, Size, Align); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(i * 4) // offset - .addMemOperand(MMO); - BuildMI(*MBB, MI, DL, - TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpReg, RegState::Kill) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - } - - MI->eraseFromParent(); + restoreSGPR(MI, Index, RS); break; } @@ -635,34 +864,62 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: - buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index) + - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); - MI->eraseFromParent(); + case AMDGPU::SI_SPILL_V32_SAVE: { + const MachineOperand *VData = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata); + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + Index, + VData->getReg(), VData->isKill(), + TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), + RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); + MI->eraseFromParent(); break; + } case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: { - buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index) + - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); + const MachineOperand *VData = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata); + + buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + Index, + VData->getReg(), VData->isKill(), + TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), + RS); MI->eraseFromParent(); break; } default: { - int64_t Offset = FrameInfo->getObjectOffset(Index); + if (TII->isMUBUF(*MI)) { + // Disable offen so we don't need a 0 vgpr base. 
+ assert(static_cast<int>(FIOperandNum) == + AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::vaddr)); + + int64_t Offset = FrameInfo.getObjectOffset(Index); + int64_t OldImm + = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); + int64_t NewOffset = OldImm + Offset; + + if (isUInt<12>(NewOffset) && + buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { + MI->eraseFromParent(); + break; + } + } + + int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -770,7 +1027,8 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return RC; // We can assume that each lane corresponds to one 32-bit register. - unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx)); + LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger(); + unsigned Count = countPopulation(Mask); if (isSGPRClass(RC)) { switch (Count) { case 1: @@ -812,7 +1070,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc( // We want to prefer the smallest register class possible, so we don't want to // stop and rewrite on anything that looks like a subregister // extract. Operations mostly don't care about the super register class, so we - // only want to stop on the most basic of copies between the smae register + // only want to stop on the most basic of copies between the same register // class. // // e.g. if we have something like @@ -828,80 +1086,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, - const TargetRegisterClass *SubRC, - unsigned Channel) const { - - switch (Reg) { - case AMDGPU::VCC: - switch(Channel) { - case 0: return AMDGPU::VCC_LO; - case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); break; - } - - case AMDGPU::TBA: - switch(Channel) { - case 0: return AMDGPU::TBA_LO; - case 1: return AMDGPU::TBA_HI; - default: llvm_unreachable("Invalid SubIdx for TBA"); break; - } - - case AMDGPU::TMA: - switch(Channel) { - case 0: return AMDGPU::TMA_LO; - case 1: return AMDGPU::TMA_HI; - default: llvm_unreachable("Invalid SubIdx for TMA"); break; - } - - case AMDGPU::FLAT_SCR: - switch (Channel) { - case 0: - return AMDGPU::FLAT_SCR_LO; - case 1: - return AMDGPU::FLAT_SCR_HI; - default: - llvm_unreachable("Invalid SubIdx for FLAT_SCR"); - } - break; - - case AMDGPU::EXEC: - switch (Channel) { - case 0: - return AMDGPU::EXEC_LO; - case 1: - return AMDGPU::EXEC_HI; - default: - llvm_unreachable("Invalid SubIdx for EXEC"); - } - break; - } - - const TargetRegisterClass *RC = getPhysRegClass(Reg); - // 32-bit registers don't have sub-registers, so we can just return the - // Reg. We need to have this check here, because the calculation below - // using getHWRegIndex() will fail with special 32-bit registers like - // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. 
- if (RC->getSize() == 4) { - assert(Channel == 0); - return Reg; - } - - unsigned Index = getHWRegIndex(Reg); - return SubRC->getRegister(Index + Channel); -} - -bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { - return OpType == AMDGPU::OPERAND_REG_IMM32; -} - -bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { - if (opCanUseLiteralConstant(OpType)) - return true; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - // FIXME: Most of these are flexible with HSA and we don't need to reserve them // as input registers if unused. Whether the dispatch ptr is necessary should be // easy to detect from used intrinsics. Scratch setup is harder to know. @@ -924,14 +1108,18 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; + if (ST.isAmdCodeObjectV2(MF)) { + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + } + assert(MFI->hasPrivateMemoryInputPtr()); + return MFI->PrivateMemoryPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; case SIRegisterInfo::DISPATCH_ID: - llvm_unreachable("unimplemented"); + assert(MFI->hasDispatchID()); + return MFI->DispatchIDUserSGPR; case SIRegisterInfo::FLAT_SCRATCH_INIT: assert(MFI->hasFlatScratchInit()); return MFI->FlatScratchInitUserSGPR; @@ -968,50 +1156,323 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, return AMDGPU::NoRegister; } -unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return 256; +unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 800; + return 512; +} + +unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 102; + return 104; +} + +unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST, + const SIMachineFunctionInfo &MFI) const { + if (MFI.hasFlatScratchInit()) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order) + + if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) + return 4; // FLAT_SCRATCH, VCC (in that order) } + + if (ST.isXNACKEnabled()) + return 4; // XNACK, VCC (in that order) + + return 2; // VCC. 
} -unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST, - unsigned WaveCount) const { - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - switch (WaveCount) { +unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST, + unsigned WavesPerEU) const { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 0; + case 8: return 81; + default: return 97; + } + } else { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 49; + case 8: return 57; + case 7: return 65; + case 6: return 73; + case 5: return 81; + default: return 97; + } + } +} + +unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST, + unsigned WavesPerEU, + bool Addressable) const { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WavesPerEU) { + case 0: return 80; case 10: return 80; case 9: return 80; case 8: return 96; - default: return 102; + default: return Addressable ? getNumAddressableSGPRs(ST) : 112; } } else { - switch(WaveCount) { + switch (WavesPerEU) { + case 0: return 48; case 10: return 48; case 9: return 56; case 8: return 64; case 7: return 72; case 6: return 80; case 5: return 96; - default: return 103; + default: return getNumAddressableSGPRs(ST); } } } -bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, - unsigned Reg) const { - const TargetRegisterClass *RC; +unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of SGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false); + unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true); + + // Check if maximum number of SGPRs was explicitly requested using + // "amdgpu-num-sgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-sgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-sgpr", MaxNumSGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI))) + Requested = 0; + + // If more SGPRs are required to support the input user/system SGPRs, + // increase to accommodate them. + // + // FIXME: This really ends up using the requested number of SGPRs + number + // of reserved special registers in total. Theoretically you could re-use + // the last input registers for these special registers, but this would + // require a lot of complexity to deal with the weird aliasing. + unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs(); + if (Requested && Requested < NumInputSGPRs) + Requested = NumInputSGPRs; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. 
+ if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumSGPRs = Requested; + } + + if (ST.hasSGPRInitBug()) + MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + + return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI), + MaxNumAddressableSGPRs); +} + +unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs( + const SISubtarget &ST) const { + if (ST.debuggerReserveRegs()) + return 4; + return 0; +} + +unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 25; + case 8: return 29; + case 7: return 33; + case 6: return 37; + case 5: return 41; + case 4: return 49; + case 3: return 65; + case 2: return 85; + default: return 129; + } +} + +unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const { + switch (WavesPerEU) { + case 0: return 24; + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return getTotalNumVGPRs(); + } +} + +unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of VGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); + + // Check if maximum number of VGPRs was explicitly requested using + // "amdgpu-num-vgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-vgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-vgpr", MaxNumVGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST)) + Requested = 0; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. 
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumVGPRs = Requested; + } + + return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST); +} + +ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, + unsigned EltSize) const { + if (EltSize == 4) { + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, + }; + + static const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, + }; + + switch (AMDGPU::getRegBitWidth(*RC->MC)) { + case 32: + return {}; + case 64: + return makeArrayRef(Sub0_1); + case 96: + return makeArrayRef(Sub0_2); + case 128: + return makeArrayRef(Sub0_3); + case 256: + return makeArrayRef(Sub0_7); + case 512: + return makeArrayRef(Sub0_15); + default: + llvm_unreachable("unhandled register size"); + } + } + + if (EltSize == 8) { + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 + }; + + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3 + }; + + switch (AMDGPU::getRegBitWidth(*RC->MC)) { + case 64: + return {}; + case 128: + return makeArrayRef(Sub0_3_64); + case 256: + return makeArrayRef(Sub0_7_64); + case 512: + return makeArrayRef(Sub0_15_64); + default: + llvm_unreachable("unhandled register size"); + } + } + + assert(EltSize == 16 && "unhandled register spill split size"); + + static const int16_t Sub0_15_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15 + }; + + static const int16_t Sub0_7_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7 + }; + + switch (AMDGPU::getRegBitWidth(*RC->MC)) { + case 128: + return {}; + case 256: + return makeArrayRef(Sub0_7_128); + case 512: + return makeArrayRef(Sub0_15_128); + default: + llvm_unreachable("unhandled register size"); + } +} + +const TargetRegisterClass* +SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) - RC = MRI.getRegClass(Reg); - else - RC = getPhysRegClass(Reg); + return MRI.getRegClass(Reg); - return hasVGPRs(RC); + return getPhysRegClass(Reg); +} + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + return hasVGPRs(getRegClassForReg(MRI, Reg)); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index d8b2d9f..0bcae7d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -16,17 +16,19 @@ #define 
LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { class SISubtarget; class MachineRegisterInfo; +class SIMachineFunctionInfo; -struct SIRegisterInfo final : public AMDGPURegisterInfo { +class SIRegisterInfo final : public AMDGPURegisterInfo { private: - unsigned SGPR32SetID; - unsigned VGPR32SetID; + unsigned SGPRSetID; + unsigned VGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; @@ -48,17 +50,16 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; - - bool requiresRegisterScavenging(const MachineFunction &Fn) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + bool requiresFrameIndexReplacementScavenging( + const MachineFunction &MF) const override; bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override; bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + int64_t getMUBUFInstrOffset(const MachineInstr *MI) const; + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; @@ -77,6 +78,12 @@ public: const TargetRegisterClass *getPointerRegClass( const MachineFunction &MF, unsigned Kind = 0) const override; + void spillSGPR(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS) const; + + void restoreSGPR(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; @@ -111,13 +118,6 @@ public: /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; - /// returns true if this is a pseudoregister class combination of VGPRs and - /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on - /// them. - static bool isPseudoRegClass(const TargetRegisterClass *RC) { - return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; - } - /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; @@ -137,20 +137,21 @@ public: const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const override; - /// \p Channel This is the register channel (e.g. a value from 0-16), not the - /// SubReg index. - /// \returns The sub-register of Reg that is in Channel. - unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, - unsigned Channel) const; - /// \returns True if operands defined with this operand type can accept /// a literal constant (i.e. any 32-bit immediate). - bool opCanUseLiteralConstant(unsigned OpType) const; + bool opCanUseLiteralConstant(unsigned OpType) const { + // TODO: 64-bit operands have extending behavior from 32-bit literal. + return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && + OpType <= AMDGPU::OPERAND_REG_IMM_LAST; + } /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
- bool opCanUseInlineConstant(unsigned OpType) const; + bool opCanUseInlineConstant(unsigned OpType) const { + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; + } enum PreloadedValue { // SGPRS: @@ -176,29 +177,104 @@ public: unsigned getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const; - /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumVGPRsAllowed(unsigned WaveCount) const; - - /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF) const; - unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; - unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + unsigned getSGPRPressureSet() const { return SGPRSetID; }; + unsigned getVGPRPressureSet() const { return VGPRSetID; }; + const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isSGPRPressureSet(unsigned SetID) const { + return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID); + } + bool isVGPRPressureSet(unsigned SetID) const { + return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID); + } + + /// \returns SGPR allocation granularity supported by the subtarget. + unsigned getSGPRAllocGranule() const { + return 8; + } + + /// \returns Total number of SGPRs supported by the subtarget. + unsigned getTotalNumSGPRs(const SISubtarget &ST) const; + + /// \returns Number of addressable SGPRs supported by the subtarget. + unsigned getNumAddressableSGPRs(const SISubtarget &ST) const; + + /// \returns Number of reserved SGPRs supported by the subtarget. + unsigned getNumReservedSGPRs(const SISubtarget &ST, + const SIMachineFunctionInfo &MFI) const; + + /// \returns Minimum number of SGPRs that meets given number of waves per + /// execution unit requirement for given subtarget. + unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const; + + /// \returns Maximum number of SGPRs that meets given number of waves per + /// execution unit requirement for given subtarget. + unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU, + bool Addressable) const; + + /// \returns Maximum number of SGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of SGPRs explicitly + /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumSGPRs(const MachineFunction &MF) const; + + /// \returns VGPR allocation granularity supported by the subtarget. + unsigned getVGPRAllocGranule() const { + return 4; + } + + /// \returns Total number of VGPRs supported by the subtarget. + unsigned getTotalNumVGPRs() const { + return 256; + } + + /// \returns Number of reserved VGPRs for debugger use supported by the + /// subtarget. 
+ unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const; + + /// \returns Minimum number of SGPRs that meets given number of waves per + /// execution unit requirement. + unsigned getMinNumVGPRs(unsigned WavesPerEU) const; + + /// \returns Maximum number of VGPRs that meets given number of waves per + /// execution unit requirement. + unsigned getMaxNumVGPRs(unsigned WavesPerEU) const; + + /// \returns Maximum number of VGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of VGPRs explicitly + /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + + ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, + unsigned EltSize) const; + private: - void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, const MachineOperand *SrcDst, - unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, - RegScavenger *RS) const; + void buildSpillLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + int Index, + unsigned ValueReg, + bool ValueIsKill, + unsigned ScratchRsrcReg, + unsigned ScratchOffsetReg, + int64_t InstrOffset, + MachineMemOperand *MMO, + RegScavenger *RS) const; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c427874..31e714b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -120,12 +120,19 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let isAllocatable = 0; } +def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { + let CopyCost = 1; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "SGPR%u", 0, 103))> { - let AllocationPriority = 1; + // Give all SGPR classes higher priority than VGPR classes, because + // we want to spill SGPRs to VGPRs. + let AllocationPriority = 7; } // SGPR 64-bit registers @@ -190,9 +197,10 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; + let Size = 32; } // VGPR 64-bit registers @@ -248,43 +256,51 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Register classes used as source and destination //===----------------------------------------------------------------------===// -class RegImmMatcher<string name> : AsmOperandClass { - let Name = name; - let RenderMethod = "addRegOrImmOperands"; -} - // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { - let AllocationPriority = 1; + let AllocationPriority = 7; +} + +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { + let AllocationPriority = 7; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SReg_32_XM0, M0)> { - let AllocationPriority = 1; +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { + let AllocationPriority = 7; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { - let AllocationPriority = 2; + let CopyCost = 1; + let AllocationPriority = 8; } def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { let isAllocatable = 0; } +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> { + let CopyCost = 1; + let AllocationPriority = 8; +} + def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> { - let AllocationPriority = 2; + (add SReg_64_XEXEC, EXEC)> { + let CopyCost = 1; + let AllocationPriority = 8; } // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { - let AllocationPriority = 4; + let AllocationPriority = 10; } def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { @@ -292,7 +308,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R } def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { - let AllocationPriority = 4; + let AllocationPriority = 10; } } // End CopyCost = 2 @@ -300,17 +316,19 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; - let AllocationPriority = 5; + let AllocationPriority = 11; } def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; - let AllocationPriority = 6; + let AllocationPriority = 12; } // Register class for all vector registers (VGPRs + Interploation Registers) def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { + let Size = 64; + // Requires 2 v_mov_b32 to copy let CopyCost = 2; let AllocationPriority = 2; @@ -325,17 +343,21 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { + let Size = 128; + // Requires 4 v_mov_b32 to copy let CopyCost = 4; let AllocationPriority = 4; } def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { + let Size = 256; let CopyCost = 8; let AllocationPriority = 5; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { + let Size = 512; let CopyCost = 16; let AllocationPriority = 6; } @@ -344,80 +366,100 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -class 
RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add VGPR_32, SReg_32)> { + let isAllocatable = 0; } -class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { + let isAllocatable = 0; } //===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate +// Register operands //===----------------------------------------------------------------------===// -def SSrc_32 : RegImmOperand<SReg_32> { - let ParserMatchClass = RegImmMatcher<"SSrc32">; +class RegImmMatcher<string name> : AsmOperandClass { + let Name = name; + let RenderMethod = "addRegOrImmOperands"; } -def SSrc_64 : RegImmOperand<SReg_64> { - let ParserMatchClass = RegImmMatcher<"SSrc64">; +multiclass SIRegOperand <string rc, string MatchName, string opType> { + let OperandNamespace = "AMDGPU" in { + def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_INT16"; + let ParserMatchClass = RegImmMatcher<MatchName#"B16">; + let DecoderMethod = "decodeOperand_VSrc16"; + } + + def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_FP16"; + let ParserMatchClass = RegImmMatcher<MatchName#"F16">; + let DecoderMethod = "decodeOperand_VSrc16"; + } + + def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_INT32"; + let ParserMatchClass = RegImmMatcher<MatchName#"B32">; + } + + def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_FP32"; + let ParserMatchClass = RegImmMatcher<MatchName#"F32">; + } + + def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + let OperandType = opType#"_INT64"; + let ParserMatchClass = RegImmMatcher<MatchName#"B64">; + } + + def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + let OperandType = opType#"_FP64"; + let ParserMatchClass = RegImmMatcher<MatchName#"F64">; + } + } } +// FIXME: 64-bit sources can sometimes use 32-bit constants. 
+multiclass RegImmOperand <string rc, string MatchName> + : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">; + +multiclass RegInlineOperand <string rc, string MatchName> + : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">; + //===----------------------------------------------------------------------===// -// SCSrc_* Operands with an SGPR or a inline constant +// SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -def SCSrc_32 : RegInlineOperand<SReg_32> { - let ParserMatchClass = RegImmMatcher<"SCSrc32">; -} +defm SSrc : RegImmOperand<"SReg", "SSrc">; //===----------------------------------------------------------------------===// -// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +// SCSrc_* Operands with an SGPR or a inline constant //===----------------------------------------------------------------------===// -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; +defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { - let CopyCost = 2; -} +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// -def VSrc_32 : RegisterOperand<VS_32> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc32">; -} +defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_64 : RegisterOperand<VS_64> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc64">; -} +def VSrc_128 : RegisterOperand<VReg_128>; //===----------------------------------------------------------------------===// -// VCSrc_* Operands with an SGPR, VGPR or an inline constant +// VSrc_* Operands with an VGPR //===----------------------------------------------------------------------===// -def VCSrc_32 : RegisterOperand<VS_32> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc32">; -} - -def VCSrc_64 : RegisterOperand<VS_64> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc64">; +// This is for operands with the enum(9), VSrc encoding restriction, +// but only allows VGPRs. 
+def VRegSrc_32 : RegisterOperand<VGPR_32> { + //let ParserMatchClass = RegImmMatcher<"VRegSrc32">; + let DecoderMethod = "DecodeVS_32RegisterClass"; } //===----------------------------------------------------------------------===// -// SCSrc_* Operands with an SGPR or an inline constant +// VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -def SCSrc_64 : RegisterOperand<SReg_64> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"SCSrc64">; -} +defm VCSrc : RegInlineOperand<"VS", "VCSrc">; diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td index ed19217..be27966 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -46,7 +46,11 @@ def Write64Bit : SchedWrite; // instructions) class SISchedMachineModel : SchedMachineModel { - let CompleteModel = 0; + let CompleteModel = 1; + // MicroOpBufferSize = 1 means that instructions will always be added + // the ready queue when they become available. This exposes them + // to the register pressure analysis. + let MicroOpBufferSize = 1; let IssueWidth = 1; let PostRAScheduler = 1; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 6cba553..dd31dc6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -45,9 +45,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Shrink Instructions"; - } + StringRef getPassName() const override { return "SI Shrink Instructions"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -86,13 +84,19 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand // is vcc. We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrining + // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + // Additional verification is needed for sdst/src2. + return true; + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -134,23 +138,15 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); // Only one literal constant is allowed per instruction, so if src0 is a // literal constant then we can't do any folding. - if (Src0.isImm() && - TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) - return; - - // Literal constants and SGPRs can only be used in Src0, so if Src0 is an - // SGPR, we cannot commute the instruction, so we can't fold any literal - // constants. 
- if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) + if (TII->isLiteralConstant(MI, Src0Idx)) return; // Try to fold Src0 + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { unsigned Reg = Src0.getReg(); MachineInstr *Def = MRI.getUniqueVRegDef(Reg); @@ -158,7 +154,8 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; - if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || + isUInt<32>(MovSrc.getImm()))) { Src0.ChangeToImmediate(MovSrc.getImm()); ConstantFolded = true; } @@ -182,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -191,7 +188,95 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, } static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { - return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4); + return isInt<16>(Src.getImm()) && + !TII->isInlineConstant(*Src.getParent(), + Src.getParent()->getOperandNo(&Src)); +} + +static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { + return isUInt<16>(Src.getImm()) && + !TII->isInlineConstant(*Src.getParent(), + Src.getParent()->getOperandNo(&Src)); +} + +static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, + const MachineOperand &Src, + bool &IsUnsigned) { + if (isInt<16>(Src.getImm())) { + IsUnsigned = false; + return !TII->isInlineConstant(Src); + } + + if (isUInt<16>(Src.getImm())) { + IsUnsigned = true; + return !TII->isInlineConstant(Src); + } + + return false; +} + +/// \returns true if the constant in \p Src should be replaced with a bitreverse +/// of an inline immediate. +static bool isReverseInlineImm(const SIInstrInfo *TII, + const MachineOperand &Src, + int32_t &ReverseImm) { + if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) + return false; + + ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); + return ReverseImm >= -16 && ReverseImm <= 64; +} + +/// Copy implicit register operands from specified instruction to this +/// instruction that are not part of the instruction definition. +static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, + const MachineInstr &MI) { + for (unsigned i = MI.getDesc().getNumOperands() + + MI.getDesc().getNumImplicitUses() + + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); + i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) + NewMI.addOperand(MF, MO); + } +} + +static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { + // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to + // get constants on the RHS. + if (!MI.getOperand(0).isReg()) + TII->commuteInstruction(MI, false, 0, 1); + + const MachineOperand &Src1 = MI.getOperand(1); + if (!Src1.isImm()) + return; + + int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); + if (SOPKOpc == -1) + return; + + // eq/ne is special because the imm16 can be treated as signed or unsigned, + // and initially selectd to the unsigned versions. 
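To make that signed/unsigned split concrete before the code below: isInt<16> and isUInt<16> disagree precisely on [-32768, -1] and on [32768, 65535], and constants that are already inline immediates (-16..64) are never rewritten at all. A standalone illustration using LLVM's MathExtras helpers (an editorial sketch, not part of the patch):

    #include "llvm/Support/MathExtras.h"
    #include <cstdio>

    int main() {
      long long Vals[] = {-5, 40000, 12};
      for (long long V : Vals)
        std::printf("%lld: isInt<16>=%d isUInt<16>=%d\n", V,
                    llvm::isInt<16>(V), llvm::isUInt<16>(V));
      // -5:    fits the signed imm16 only  -> eq/ne switches to the *_I32 form.
      // 40000: fits the unsigned imm16 only -> the *_U32 form is kept.
      // 12:    already an inline constant, so no SOPK rewrite is needed.
      return 0;
    }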
+ if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { + bool HasUImm; + if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { + if (!HasUImm) { + SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? + AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; + } + + MI.setDesc(TII->get(SOPKOpc)); + } + + return; + } + + const MCInstrDesc &NewDesc = TII->get(SOPKOpc); + + if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || + (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { + MI.setDesc(NewDesc); + } } bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { @@ -226,14 +311,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { - int64_t Imm = Src.getImm(); - if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) { - int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm)); - if (ReverseImm >= -16 && ReverseImm <= 64) { - MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); - Src.setImm(ReverseImm); - continue; - } + int32_t ReverseImm; + if (isReverseInlineImm(TII, Src, ReverseImm)) { + MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); + Src.setImm(ReverseImm); + continue; } } } @@ -272,21 +354,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // satisfied. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { - const MachineOperand &Dest = MI.getOperand(0); - const MachineOperand &Src0 = MI.getOperand(1); - const MachineOperand &Src1 = MI.getOperand(2); + const MachineOperand *Dest = &MI.getOperand(0); + MachineOperand *Src0 = &MI.getOperand(1); + MachineOperand *Src1 = &MI.getOperand(2); + + if (!Src0->isReg() && Src1->isReg()) { + if (TII->commuteInstruction(MI, false, 1, 2)) + std::swap(Src0, Src1); + } // FIXME: This could work better if hints worked with subregisters. If // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. - if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) && - Src0.isReg()) { - MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg()); + if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && + Src0->isReg()) { + MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); + MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } - if (Src0.isReg() && Src0.getReg() == Dest.getReg()) { - if (Src1.isImm() && isKImmOperand(TII, Src1)) { + if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { + if (Src1->isImm() && isKImmOperand(TII, *Src1)) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; @@ -296,12 +384,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } } + // Try to use s_cmpk_* + if (MI.isCompare() && TII->isSOPC(MI)) { + shrinkScalarCompare(TII, MI); + continue; + } + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 
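The rewrite that follows keeps two options open for a 32-bit scalar move: a signed 16-bit immediate (s_movk_i32) or, failing that, a constant whose bit-reversed value lands in the inline-immediate range [-16, 64] (s_brev_b32, mirroring the v_bfrev_b32 case handled earlier in the pass). The underlying arithmetic, as a standalone editorial sketch built on LLVM's MathExtras helpers (not part of the patch):

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
      // s_movk_i32 candidate: any literal that fits a signed 16-bit immediate.
      std::printf("0x1234 fits imm16: %d\n", llvm::isInt<16>(0x1234));     // 1

      // s_brev_b32 candidate: the bit-reversed value must be an inline
      // immediate, i.e. in the range [-16, 64].
      uint32_t Rev = llvm::reverseBits<uint32_t>(0x80000000u);             // == 1
      std::printf("reverseBits(0x80000000) = %u, inline? %d\n", Rev,
                  Rev <= 64u);                                             // 1 1
      return 0;
    }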
if (MI.getOpcode() == AMDGPU::S_MOV_B32) { - const MachineOperand &Src = MI.getOperand(1); + const MachineOperand &Dst = MI.getOperand(0); + MachineOperand &Src = MI.getOperand(1); - if (Src.isImm() && isKImmOperand(TII, Src)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + if (Src.isImm() && + TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) { + int32_t ReverseImm; + if (isKImmOperand(TII, Src)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + else if (isReverseInlineImm(TII, Src, ReverseImm)) { + MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); + Src.setImm(ReverseImm); + } + } continue; } @@ -358,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Check for the bool flag output for instructions like V_ADD_I32_e64. + const MachineOperand *SDst = TII->getNamedOperand(MI, + AMDGPU::OpName::sdst); + + // Check the carry-in operand for v_addc_u32_e64. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); + + if (SDst) { + if (SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + continue; + } + + // All of the instructions with carry outs also have an SGPR input in + // src2. + if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + + continue; + } + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); @@ -383,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); - const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { @@ -398,9 +524,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } ++NumInstructionsShrunk; - MI.eraseFromParent(); + // Copy extra operands not present in the instruction definition. 
+ copyExtraImplicitOps(*Inst32, MF, MI); + + MI.eraseFromParent(); foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index facc0c7..aad6853 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -42,9 +42,7 @@ public: SITypeRewriter() : FunctionPass(ID) { } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "SI Type Rewriter"; - } + StringRef getPassName() const override { return "SI Type Rewriter"; } void visitLoadInst(LoadInst &I); void visitCallInst(CallInst &I); void visitBitCast(BitCastInst &I); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 1534d58..a613a22 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -53,10 +53,28 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <vector> using namespace llvm; @@ -69,6 +87,25 @@ enum { StateExact = 0x2, }; +struct PrintState { +public: + int State; + + explicit PrintState(int State) : State(State) {} +}; + +static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { + if (PS.State & StateWQM) + OS << "WQM"; + if (PS.State & StateExact) { + if (PS.State & StateWQM) + OS << '|'; + OS << "Exact"; + } + + return OS; +} + struct InstrInfo { char Needs = 0; char OutNeeds = 0; @@ -84,7 +121,7 @@ struct WorkItem { MachineBasicBlock *MBB = nullptr; MachineInstr *MI = nullptr; - WorkItem() {} + WorkItem() = default; WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} WorkItem(MachineInstr *MI) : MI(MI) {} }; @@ -98,16 +135,26 @@ private: DenseMap<const MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; - SmallVector<const MachineInstr *, 2> ExecExports; SmallVector<MachineInstr *, 1> LiveMaskQueries; + void printInfo(); + void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); + void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); char analyzeFunction(MachineFunction &MF); + bool requiresCorrectState(const MachineInstr &MI) const; + + MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before); + 
MachineBasicBlock::iterator + prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Last, bool PreferLast, + bool SaveSCC); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -124,9 +171,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "SI Whole Quad Mode"; - } + StringRef getPassName() const override { return "SI Whole Quad Mode"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); @@ -135,7 +180,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char SIWholeQuadMode::ID = 0; @@ -151,6 +196,24 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } +void SIWholeQuadMode::printInfo() { + for (const auto &BII : Blocks) { + dbgs() << "\nBB#" << BII.first->getNumber() << ":\n" + << " InNeeds = " << PrintState(BII.second.InNeeds) + << ", Needs = " << PrintState(BII.second.Needs) + << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n"; + + for (const MachineInstr &MI : *BII.first) { + auto III = Instructions.find(&MI); + if (III == Instructions.end()) + continue; + + dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) + << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; + } + } +} + void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist) { InstrInfo &II = Instructions[&MI]; @@ -168,6 +231,45 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, Worklist.push_back(&MI); } +/// Mark all instructions defining the uses in \p MI as WQM. +void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, + std::vector<WorkItem> &Worklist) { + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) + continue; + + unsigned Reg = Use.getReg(); + + // Handle physical registers that we need to track; this is mostly relevant + // for VCC, which can appear as the (implicit) input of a uniform branch, + // e.g. when a loop counter is stored in a VGPR. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Reg == AMDGPU::EXEC) + continue; + + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) + continue; + + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. + if (Value->isPHIDef()) + continue; + + markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + Worklist); + } + + continue; + } + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, StateWQM, Worklist); + } +} + // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. char SIWholeQuadMode::scanInstructions(MachineFunction &MF, @@ -183,16 +285,19 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { + if (TII->isDS(Opcode)) { Flags = StateWQM; + } else if (TII->isWQM(Opcode)) { + // Sampling instructions don't need to produce results for all pixels + // in a quad, they just require all inputs of a quad to have been + // computed for derivatives. 
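The comment above is the core reason whole quad mode exists: screen-space derivatives are formed per 2x2 quad by differencing a value across neighbouring lanes, so those neighbours must have computed their inputs even when a branch has disabled their pixels. A rough editorial illustration of that differencing (not hardware-accurate and not part of the patch):

    // For a 2x2 quad holding a value v at lanes (x, y), x and y in {0, 1}:
    // coarse derivatives are plain lane differences, so every lane's v must be
    // computed (WQM) even if only some lanes ultimately export a result (Exact).
    float ddx_coarse(const float v[2][2], int y) { return v[1][y] - v[0][y]; }
    float ddy_coarse(const float v[2][2], int x) { return v[x][1] - v[x][0]; }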
+ markUsesWQM(MI, Worklist); + GlobalFlags |= StateWQM; + continue; } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { - // Handle export instructions with the exec mask valid flag set - if (Opcode == AMDGPU::EXP) { - if (MI.getOperand(4).getImm() != 0) - ExecExports.push_back(&MI); - } else if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical @@ -259,43 +364,9 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, // Propagate WQM flag to instruction inputs assert(II.Needs != (StateWQM | StateExact)); - if (II.Needs != StateWQM) - return; - - for (const MachineOperand &Use : MI.uses()) { - if (!Use.isReg() || !Use.isUse()) - continue; - - unsigned Reg = Use.getReg(); - - // Handle physical registers that we need to track; this is mostly relevant - // for VCC, which can appear as the (implicit) input of a uniform branch, - // e.g. when a loop counter is stored in a VGPR. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - if (Reg == AMDGPU::EXEC) - continue; - for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { - LiveRange &LR = LIS->getRegUnit(*RegUnit); - const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); - if (!Value) - continue; - - // Since we're in machine SSA, we do not need to track physical - // registers across basic blocks. - if (Value->isPHIDef()) - continue; - - markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, - Worklist); - } - - continue; - } - - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, StateWQM, Worklist); - } + if (II.Needs == StateWQM) + markUsesWQM(MI, Worklist); } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -351,32 +422,140 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { return GlobalFlags; } +/// Whether \p MI really requires the exec state computed during analysis. +/// +/// Scalar instructions must occasionally be marked WQM for correct propagation +/// (e.g. thread masks leading up to branches), but when it comes to actual +/// execution, they don't care about EXEC. +bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { + if (MI.isTerminator()) + return true; + + // Skip instructions that are not affected by EXEC + if (TII->isScalarUnit(MI)) + return false; + + // Generic instructions such as COPY will either disappear by register + // coalescing or be lowered to SALU or VALU instructions. 
+ if (MI.isTransient()) { + if (MI.getNumExplicitOperands() >= 1) { + const MachineOperand &Op = MI.getOperand(0); + if (Op.isReg()) { + if (TRI->isSGPRReg(*MRI, Op.getReg())) { + // SGPR instructions are not affected by EXEC + return false; + } + } + } + } + + return true; +} + +MachineBasicBlock::iterator +SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before) { + unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineInstr *Save = + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) + .addReg(AMDGPU::SCC); + MachineInstr *Restore = + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(SaveReg); + + LIS->InsertMachineInstrInMaps(*Save); + LIS->InsertMachineInstrInMaps(*Restore); + LIS->createAndComputeVirtRegInterval(SaveReg); + + return Restore; +} + +// Return an iterator in the (inclusive) range [First, Last] at which +// instructions can be safely inserted, keeping in mind that some of the +// instructions we want to add necessarily clobber SCC. +MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( + MachineBasicBlock &MBB, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + if (!SaveSCC) + return PreferLast ? Last : First; + + LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + auto MBBE = MBB.end(); + SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) + : LIS->getMBBEndIdx(&MBB); + SlotIndex LastIdx = + Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB); + SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; + const LiveRange::Segment *S; + + for (;;) { + S = LR.getSegmentContaining(Idx); + if (!S) + break; + + if (PreferLast) { + SlotIndex Next = S->start.getBaseIndex(); + if (Next < FirstIdx) + break; + Idx = Next; + } else { + SlotIndex Next = S->end.getNextIndex().getBaseIndex(); + if (Next > LastIdx) + break; + Idx = Next; + } + } + + MachineBasicBlock::iterator MBBI; + + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + MBBI = MI; + else { + assert(Idx == LIS->getMBBEndIdx(&MBB)); + MBBI = MBB.end(); + } + + if (S) + MBBI = saveSCC(MBB, MBBI); + + return MBBI; +} + void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg) { + MachineInstr *MI; + if (SaveWQM) { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), - SaveWQM) - .addReg(LiveMaskReg); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + SaveWQM) + .addReg(LiveMaskReg); } else { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(LiveMaskReg); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(LiveMaskReg); } + + LIS->InsertMachineInstrInMaps(*MI); } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM) { + MachineInstr *MI; + if (SavedWQM) { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) - .addReg(SavedWQM); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedWQM); } else { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); } + + 
LIS->InsertMachineInstrInMaps(*MI); } void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, @@ -395,72 +574,82 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) return; + DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n"); + unsigned SavedWQMReg = 0; bool WQMFromExec = isEntry; char State = isEntry ? StateExact : StateWQM; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); - while (II != IE) { - MachineInstr &MI = *II; - ++II; + if (isEntry) + ++II; // Skip the instruction that saves LiveMask - // Skip instructions that are not affected by EXEC - if (TII->isScalarUnit(MI) && !MI.isTerminator()) - continue; + MachineBasicBlock::iterator First = IE; + for (;;) { + MachineBasicBlock::iterator Next = II; + char Needs = 0; + char OutNeeds = 0; - // Generic instructions such as COPY will either disappear by register - // coalescing or be lowered to SALU or VALU instructions. - if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { - if (MI.getNumExplicitOperands() >= 1) { - const MachineOperand &Op = MI.getOperand(0); - if (Op.isReg()) { - if (TRI->isSGPRReg(*MRI, Op.getReg())) { - // SGPR instructions are not affected by EXEC - continue; - } + if (First == IE) + First = II; + + if (II != IE) { + MachineInstr &MI = *II; + + if (requiresCorrectState(MI)) { + auto III = Instructions.find(&MI); + if (III != Instructions.end()) { + Needs = III->second.Needs; + OutNeeds = III->second.OutNeeds; } } - } - char Needs = 0; - char OutNeeds = 0; - auto InstrInfoIt = Instructions.find(&MI); - if (InstrInfoIt != Instructions.end()) { - Needs = InstrInfoIt->second.Needs; - OutNeeds = InstrInfoIt->second.OutNeeds; - - // Make sure to switch to Exact mode before the end of the block when - // Exact and only Exact is needed further downstream. 
- if (OutNeeds == StateExact && MI.isTerminator()) { - assert(Needs == 0); + if (MI.isTerminator() && !Needs && OutNeeds == StateExact) + Needs = StateExact; + + if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact) + MI.getOperand(3).setImm(1); + + ++Next; + } else { + // End of basic block + if (BI.OutNeeds & StateWQM) + Needs = StateWQM; + else if (BI.OutNeeds == StateExact) Needs = StateExact; - } } - // State switching - if (Needs && State != Needs) { - if (Needs == StateExact) { - assert(!SavedWQMReg); + if (Needs) { + if (Needs != State) { + MachineBasicBlock::iterator Before = + prepareInsertion(MBB, First, II, Needs == StateWQM, + Needs == StateExact || WQMFromExec); - if (!WQMFromExec && (OutNeeds & StateWQM)) - SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + if (Needs == StateExact) { + if (!WQMFromExec && (OutNeeds & StateWQM)) + SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); - toExact(MBB, &MI, SavedWQMReg, LiveMaskReg); - } else { - assert(WQMFromExec == (SavedWQMReg == 0)); - toWQM(MBB, &MI, SavedWQMReg); - SavedWQMReg = 0; + toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + } else { + assert(WQMFromExec == (SavedWQMReg == 0)); + + toWQM(MBB, Before, SavedWQMReg); + + if (SavedWQMReg) { + LIS->createAndComputeVirtRegInterval(SavedWQMReg); + SavedWQMReg = 0; + } + } + + State = Needs; } - State = Needs; + First = IE; } - } - if ((BI.OutNeeds & StateWQM) && State != StateWQM) { - assert(WQMFromExec == (SavedWQMReg == 0)); - toWQM(MBB, MBB.end(), SavedWQMReg); - } else if (BI.OutNeeds == StateExact && State != StateExact) { - toExact(MBB, MBB.end(), 0, LiveMaskReg); + if (II == IE) + break; + II = Next; } } @@ -468,8 +657,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); unsigned Dest = MI->getOperand(0).getReg(); - BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) - .addReg(LiveMaskReg); + MachineInstr *Copy = + BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + + LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); } } @@ -480,7 +672,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Instructions.clear(); Blocks.clear(); - ExecExports.clear(); LiveMaskQueries.clear(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -504,8 +695,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(AMDGPU::EXEC); + MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(AMDGPU::EXEC); + LIS->InsertMachineInstrInMaps(*MI); } if (GlobalFlags == StateWQM) { @@ -520,11 +713,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { } } + DEBUG(printInfo()); + lowerLiveMaskQueries(LiveMaskReg); // Handle the general case for (auto BII : Blocks) processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + // Physical registers like SCC aren't tracked by default anyway, so just + // removing the ranges we computed is the simplest option for maintaining + // the analysis results. 
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td new file mode 100644 index 0000000..0265648 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -0,0 +1,535 @@ +//===---- SMInstructions.td - Scalar Memory Instruction Defintions --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def smrd_offset_8 : NamedOperandU32<"SMRDOffset8", + NamedMatchClass<"SMRDOffset8">> { + let OperandType = "OPERAND_IMMEDIATE"; +} + +def smrd_offset_20 : NamedOperandU32<"SMRDOffset20", + NamedMatchClass<"SMRDOffset20">> { + let OperandType = "OPERAND_IMMEDIATE"; +} + +//===----------------------------------------------------------------------===// +// Scalar Memory classes +//===----------------------------------------------------------------------===// + +class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : + InstSI <outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + let LGKM_CNT = 1; + let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; + let SubtargetPredicate = isGCN; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_sbase = 1; + bits<1> has_sdst = 1; + bit has_glc = 0; + bits<1> has_offset = 1; + bits<1> offset_is_imm = 0; +} + +class SM_Real <SM_Pseudo ps> + : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + + // encoding + bits<7> sbase; + bits<7> sdst; + bits<32> offset; + bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); +} + +class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> + : SM_Pseudo<opName, outs, ins, asmOps, pattern> { + RegisterClass BaseClass; + let mayLoad = 1; + let mayStore = 0; + let has_glc = 1; +} + +class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []> + : SM_Pseudo<opName, (outs), ins, asmOps, pattern> { + RegisterClass BaseClass; + RegisterClass SrcClass; + let mayLoad = 0; + let mayStore = 1; + let has_glc = 1; + let ScalarStore = 1; +} + +multiclass SM_Pseudo_Loads<string opName, + RegisterClass baseClass, + RegisterClass dstClass> { + def _IMM : SM_Load_Pseudo <opName, + (outs dstClass:$sdst), + (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc), + " $sdst, $sbase, $offset$glc", []> { + let offset_is_imm = 1; + let BaseClass = baseClass; + let PseudoInstr = opName # "_IMM"; + let has_glc = 1; + } + + def _SGPR : SM_Load_Pseudo <opName, + (outs dstClass:$sdst), + (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc), + " $sdst, $sbase, $offset$glc", []> { + let BaseClass = baseClass; + let PseudoInstr = opName # "_SGPR"; + let has_glc = 1; + } +} + +multiclass SM_Pseudo_Stores<string opName, + RegisterClass baseClass, + RegisterClass srcClass> { + def _IMM : SM_Store_Pseudo <opName, + (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc), + " $sdata, $sbase, $offset$glc", []> { + let offset_is_imm = 1; 
+ let BaseClass = baseClass; + let SrcClass = srcClass; + let PseudoInstr = opName # "_IMM"; + } + + def _SGPR : SM_Store_Pseudo <opName, + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc), + " $sdata, $sbase, $offset$glc", []> { + let BaseClass = baseClass; + let SrcClass = srcClass; + let PseudoInstr = opName # "_SGPR"; + } +} + +class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< + opName, (outs SReg_64_XEXEC:$sdst), (ins), + " $sdst", [(set i64:$sdst, (node))]> { + let hasSideEffects = 1; + // FIXME: mayStore = ? is a workaround for tablegen bug for different + // inferred mayStore flags for the instruction pattern vs. standalone + // Pat. Each considers the other contradictory. + let mayStore = ?; + let mayLoad = ?; + let has_sbase = 0; + let has_offset = 0; +} + +class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo< + opName, (outs), (ins), "", [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let has_sdst = 0; + let has_sbase = 0; + let has_offset = 0; +} + + +//===----------------------------------------------------------------------===// +// Scalar Memory Instructions +//===----------------------------------------------------------------------===// + +// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SReg_32_XM0 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. + +// XXX - SMEM instructions do not allow exec for data operand, but +// does sdst for SMRD on SI/CI? +defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads < + "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC +>; + +// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on +// SI/CI, bit disallowed for SMEM on VI. 
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads < + "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC +>; + +defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads < + "s_buffer_load_dwordx4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads < + "s_buffer_load_dwordx8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads < + "s_buffer_load_dwordx16", SReg_128, SReg_512 +>; + +defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; + +defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores < + "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC +>; + +defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < + "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC +>; + +defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < + "s_buffer_store_dwordx4", SReg_128, SReg_128 +>; + + +def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>; +def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>; + +let SubtargetPredicate = isCIVI in { +def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; +} // let SubtargetPredicate = isCIVI + +let SubtargetPredicate = isVI in { +def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; +def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; +} // SubtargetPredicate = isVI + + + +//===----------------------------------------------------------------------===// +// Scalar Memory Patterns +//===----------------------------------------------------------------------===// + + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + auto Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= 4 && + ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || + (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); +}]>; + +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + +let Predicates = [isGCN] in { + +multiclass SMRD_Pattern <string Instr, ValueType vt> { + + // 1. IMM offset + def : Pat < + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) + >; + + // 2. SGPR offset + def : Pat < + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) + >; +} + +let Predicates = [isSICI] in { +def : Pat < + (i64 (readcyclecounter)), + (S_MEMTIME) +>; +} + +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. 
+let AddedComplexity = 100 in { + +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; + +// 1. Offset as an immediate +def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI + (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0) +>; + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0) +>; + +} // End let AddedComplexity = 100 + +} // let Predicates = [isGCN] + +let Predicates = [isVI] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0) +>; + +def : Pat < + (i64 (readcyclecounter)), + (S_MEMREALTIME) +>; + +} // let Predicates = [isVI] + + +//===----------------------------------------------------------------------===// +// Targets +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +class SMRD_Real_si <bits<5> op, SM_Pseudo ps> + : SM_Real<ps> + , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> + , Enc32 { + + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); + let Inst{8} = imm; + let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +// FIXME: Assembler should reject trying to use glc on SMRD +// instructions on SI. 
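As a cross-check of the bit layout in SMRD_Real_si above, the same packing written out by hand. This encoder is an editorial sketch derived from the Inst{...} assignments in that class, not code that exists in LLVM.

    #include <cstdint>

    // Pack the SI SMRD fields exactly as SMRD_Real_si lays them out. Only bits
    // 6..1 of the 7-bit sbase operand are encoded (sbase{6-1}), i.e. the SGPR
    // pair number, since the base is a 64-bit register pair.
    static uint32_t encodeSMRD_si(uint8_t op, uint8_t sdst, uint8_t sbase,
                                  bool imm, uint8_t offset) {
      uint32_t Inst = 0;
      Inst |= offset;                               // Inst{7-0}
      Inst |= uint32_t(imm) << 8;                   // Inst{8}
      Inst |= uint32_t((sbase >> 1) & 0x3f) << 9;   // Inst{14-9} = sbase{6-1}
      Inst |= uint32_t(sdst & 0x7f) << 15;          // Inst{21-15}
      Inst |= uint32_t(op & 0x1f) << 22;            // Inst{26-22}
      Inst |= 0x18u << 27;                          // Inst{31-27}, encoding
      return Inst;
    }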
+multiclass SM_Real_Loads_si<bits<5> op, string ps, + SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), + SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { + + def _IMM_si : SMRD_Real_si <op, immPs> { + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc); + } + + // FIXME: The operand name $offset is inconsistent with $soff used + // in the pseudo + def _SGPR_si : SMRD_Real_si <op, sgprPs> { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + } + +} + +defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">; +defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">; +defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">; +defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">; +defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">; +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">; + +def S_MEMTIME_si : SMRD_Real_si <0x1e, S_MEMTIME>; +def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>; + + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> + : SM_Real<ps> + , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> + , Enc64 { + bit glc; + + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); + + let Inst{16} = !if(ps.has_glc, glc, ?); + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?); +} + +multiclass SM_Real_Loads_vi<bits<8> op, string ps, + SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), + SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { + def _IMM_vi : SMEM_Real_vi <op, immPs> { + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + } + def _SGPR_vi : SMEM_Real_vi <op, sgprPs> { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + } +} + +class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { + // encoding + bits<7> sdata; + + let sdst = ?; + let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); +} + +multiclass SM_Real_Stores_vi<bits<8> op, string ps, + SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), + SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { + // FIXME: The operand name $offset is inconsistent with $soff used + // in the pseudo + def _IMM_vi : SMEM_Real_Store_vi <op, immPs> { + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + } + + def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> { + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + } +} + +defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; +defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">; +defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">; +defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">; +defm S_LOAD_DWORDX16 : 
SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">; +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">; + +defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">; +defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">; +defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">; + +defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">; +defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">; +defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">; + +// These instructions use same encoding +def S_DCACHE_INV_vi : SMEM_Real_vi <0x20, S_DCACHE_INV>; +def S_DCACHE_WB_vi : SMEM_Real_vi <0x21, S_DCACHE_WB>; +def S_DCACHE_INV_VOL_vi : SMEM_Real_vi <0x22, S_DCACHE_INV_VOL>; +def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>; +def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>; +def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>; + + +//===----------------------------------------------------------------------===// +// CI +//===----------------------------------------------------------------------===// + +def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", + NamedMatchClass<"SMRDLiteralOffset">> { + let OperandType = "OPERAND_IMMEDIATE"; +} + +class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : + SM_Real<ps>, + Enc64 { + + let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; + let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc); + + let LGKM_CNT = ps.LGKM_CNT; + let SMRD = ps.SMRD; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let hasSideEffects = ps.hasSideEffects; + let SchedRW = ps.SchedRW; + let UseNamedOperandTable = ps.UseNamedOperandTable; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst{6-0}; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset{31-0}; +} + +def S_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x00, S_LOAD_DWORD_IMM>; +def S_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x01, S_LOAD_DWORDX2_IMM>; +def S_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x02, S_LOAD_DWORDX4_IMM>; +def S_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x03, S_LOAD_DWORDX8_IMM>; +def S_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x04, S_LOAD_DWORDX16_IMM>; +def S_BUFFER_LOAD_DWORD_IMM_ci : SMRD_Real_Load_IMM_ci <0x08, S_BUFFER_LOAD_DWORD_IMM>; +def S_BUFFER_LOAD_DWORDX2_IMM_ci : SMRD_Real_Load_IMM_ci <0x09, S_BUFFER_LOAD_DWORDX2_IMM>; +def S_BUFFER_LOAD_DWORDX4_IMM_ci : SMRD_Real_Load_IMM_ci <0x0a, S_BUFFER_LOAD_DWORDX4_IMM>; +def S_BUFFER_LOAD_DWORDX8_IMM_ci : SMRD_Real_Load_IMM_ci <0x0b, S_BUFFER_LOAD_DWORDX8_IMM>; +def S_BUFFER_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x0c, S_BUFFER_LOAD_DWORDX16_IMM>; + +class SMRD_Real_ci <bits<5> op, SM_Pseudo ps> + : SM_Real<ps> + , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> + , Enc32 { + + let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; + + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); + let Inst{8} = imm; + let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); + 
let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; + +let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { + +class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat < + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> { + let Predicates = [isCIOnly]; +} + +def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>; +def : SMRD_Pattern_ci <"S_LOAD_DWORDX2", v2i32>; +def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>; +def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>; +def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>; + +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> { + let Predicates = [isCI]; // should this be isCIOnly? +} + +} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity + diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td new file mode 100644 index 0000000..73cd577 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -0,0 +1,1232 @@ +//===-- SOPInstructions.td - SOP Instruction Defintions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def GPRIdxModeMatchClass : AsmOperandClass { + let Name = "GPRIdxMode"; + let PredicateMethod = "isGPRIdxMode"; + let RenderMethod = "addImmOperands"; +} + +def GPRIdxMode : Operand<i32> { + let PrintMethod = "printVGPRIndexMode"; + let ParserMatchClass = GPRIdxModeMatchClass; + let OperandType = "OPERAND_IMMEDIATE"; +} + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// + +class SOP1_Pseudo <string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI <outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let SubtargetPredicate = isGCN; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOP1 = 1; + let SchedRW = [WriteSALU]; + let Size = 4; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_src0 = 1; + bits<1> has_sdst = 1; +} + +class SOP1_Real<bits<8> op, SOP1_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, + ps.Mnemonic # " " # ps.AsmOperands, []>, + Enc32 { + + let isPseudo = 0; + let isCodeGenOnly = 0; + let Size = 4; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + + // encoding + bits<7> sdst; + bits<8> src0; + + let Inst{7-0} = !if(ps.has_src0, src0, ?); + let Inst{15-8} = op; + let Inst{22-16} = !if(ps.has_sdst, sdst, ?); + let Inst{31-23} = 0x17d; //encoding; +} + +class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0), + "$sdst, $src0", pattern +>; + +// 32-bit input, no output. 
+class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo < + opName, (outs), (ins SSrc_b32:$src0), + "$src0", pattern> { + let has_sdst = 0; +} + +class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0), + "$sdst, $src0", pattern +>; + +// 64-bit input, 32-bit output. +class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs SReg_32:$sdst), (ins SSrc_b64:$src0), + "$sdst, $src0", pattern +>; + +// 32-bit input, 64-bit output. +class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0), + "$sdst, $src0", pattern +>; + +// no input, 64-bit output. +class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs SReg_64:$sdst), (ins), "$sdst", pattern> { + let has_src0 = 0; +} + +// 64-bit input, no output +class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SReg_64:$src0), "$src0", pattern> { + let has_sdst = 0; +} + + +let isMoveImm = 1 in { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + def S_MOV_B32 : SOP1_32 <"s_mov_b32">; + def S_MOV_B64 : SOP1_64 <"s_mov_b64">; + } // End isRematerializeable = 1 + + let Uses = [SCC] in { + def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">; + def S_CMOV_B64 : SOP1_64 <"s_cmov_b64">; + } // End Uses = [SCC] +} // End isMoveImm = 1 + +let Defs = [SCC] in { + def S_NOT_B32 : SOP1_32 <"s_not_b32", + [(set i32:$sdst, (not i32:$src0))] + >; + + def S_NOT_B64 : SOP1_64 <"s_not_b64", + [(set i64:$sdst, (not i64:$src0))] + >; + def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; +} // End Defs = [SCC] + + +def S_BREV_B32 : SOP1_32 <"s_brev_b32", + [(set i32:$sdst, (bitreverse i32:$src0))] +>; +def S_BREV_B64 : SOP1_64 <"s_brev_b64">; + +let Defs = [SCC] in { +def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; +def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", + [(set i32:$sdst, (ctpop i32:$src0))] +>; +def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">; +} // End Defs = [SCC] + +def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; +def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; +def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", + [(set i32:$sdst, (cttz_zero_undef i32:$src0))] +>; +def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; + +def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", + [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] +>; + +def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">; +def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", + [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] +>; +def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; +def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", + [(set i32:$sdst, (sext_inreg i32:$src0, i8))] +>; +def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", + [(set i32:$sdst, (sext_inreg i32:$src0, i16))] +>; + +def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; +def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; +def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; +def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; + +let isTerminator = 1, isBarrier = 1, + isBranch = 1, isIndirectBranch = 1 in { +def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; +} +def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; +def S_RFE_B64 : SOP1_1 <"s_rfe_b64">; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +def S_AND_SAVEEXEC_B64 : SOP1_64 <"s_and_saveexec_b64">; +def S_OR_SAVEEXEC_B64 : SOP1_64 
<"s_or_saveexec_b64">; +def S_XOR_SAVEEXEC_B64 : SOP1_64 <"s_xor_saveexec_b64">; +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <"s_andn2_saveexec_b64">; +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <"s_orn2_saveexec_b64">; +def S_NAND_SAVEEXEC_B64 : SOP1_64 <"s_nand_saveexec_b64">; +def S_NOR_SAVEEXEC_B64 : SOP1_64 <"s_nor_saveexec_b64">; +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">; +def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">; + +let Uses = [M0] in { +def S_MOVRELS_B32 : SOP1_32 <"s_movrels_b32">; +def S_MOVRELS_B64 : SOP1_64 <"s_movrels_b64">; +def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; +def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; +} // End Uses = [M0] + +def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">; +def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">; +let Defs = [SCC] in { +def S_ABS_I32 : SOP1_32 <"s_abs_i32">; +} // End Defs = [SCC] +def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">; + +let SubtargetPredicate = HasVGPRIndexMode in { +def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { + let Uses = [M0]; + let Defs = [M0]; +} +} + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +class SOP2_Pseudo<string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let SubtargetPredicate = isGCN; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOP2 = 1; + let SchedRW = [WriteSALU]; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_sdst = 1; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + // field bits<7> sdst = 0; + // let Size = 4; // Do we need size here? 
+} + +class SOP2_Real<bits<7> op, SOP2_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, + ps.Mnemonic # " " # ps.AsmOperands, []>, + Enc32 { + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + + // encoding + bits<7> sdst; + bits<8> src0; + bits<8> src1; + + let Inst{7-0} = src0; + let Inst{15-8} = src1; + let Inst{22-16} = !if(ps.has_sdst, sdst, ?); + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding +} + + +class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1), + "$sdst, $src0, $src1", pattern +>; + +class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), + "$sdst, $src0, $src1", pattern +>; + +class SOP2_64_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b32:$src1), + "$sdst, $src0, $src1", pattern +>; + +class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1), + "$sdst, $src0, $src1", pattern +>; + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +def S_ADD_U32 : SOP2_32 <"s_add_u32">; +def S_ADD_I32 : SOP2_32 <"s_add_i32", + [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))] +>; +} // End isCommutable = 1 + +def S_SUB_U32 : SOP2_32 <"s_sub_u32">; +def S_SUB_I32 : SOP2_32 <"s_sub_i32", + [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +def S_ADDC_U32 : SOP2_32 <"s_addc_u32", + [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; +} // End isCommutable = 1 + +def S_SUBB_U32 : SOP2_32 <"s_subb_u32", + [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; +} // End Uses = [SCC] + + +let isCommutable = 1 in { +def S_MIN_I32 : SOP2_32 <"s_min_i32", + [(set i32:$sdst, (smin i32:$src0, i32:$src1))] +>; +def S_MIN_U32 : SOP2_32 <"s_min_u32", + [(set i32:$sdst, (umin i32:$src0, i32:$src1))] +>; +def S_MAX_I32 : SOP2_32 <"s_max_i32", + [(set i32:$sdst, (smax i32:$src0, i32:$src1))] +>; +def S_MAX_U32 : SOP2_32 <"s_max_u32", + [(set i32:$sdst, (umax i32:$src0, i32:$src1))] +>; +} // End isCommutable = 1 +} // End Defs = [SCC] + + +let Uses = [SCC] in { + def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">; + def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">; +} // End Uses = [SCC] + +let Defs = [SCC] in { +let isCommutable = 1 in { +def S_AND_B32 : SOP2_32 <"s_and_b32", + [(set i32:$sdst, (and i32:$src0, i32:$src1))] +>; + +def S_AND_B64 : SOP2_64 <"s_and_b64", + [(set i64:$sdst, (and i64:$src0, i64:$src1))] +>; + +def S_OR_B32 : SOP2_32 <"s_or_b32", + [(set i32:$sdst, (or i32:$src0, i32:$src1))] +>; + +def S_OR_B64 : SOP2_64 <"s_or_b64", + [(set i64:$sdst, (or i64:$src0, i64:$src1))] +>; + +def S_XOR_B32 : SOP2_32 <"s_xor_b32", + [(set i32:$sdst, (xor i32:$src0, i32:$src1))] +>; + +def S_XOR_B64 : SOP2_64 <"s_xor_b64", + [(set i64:$sdst, (xor i64:$src0, i64:$src1))] +>; +} // End isCommutable = 1 + +def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">; +def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">; +def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">; +def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">; +def S_NAND_B32 : SOP2_32 <"s_nand_b32">; +def S_NAND_B64 : SOP2_64 <"s_nand_b64">; +def S_NOR_B32 : SOP2_32 <"s_nor_b32">; 
+def S_NOR_B64 : SOP2_64 <"s_nor_b64">; +def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">; +def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">; +} // End Defs = [SCC] + +// Use added complexity so these patterns are preferred to the VALU patterns. +let AddedComplexity = 1 in { + +let Defs = [SCC] in { +def S_LSHL_B32 : SOP2_32 <"s_lshl_b32", + [(set i32:$sdst, (shl i32:$src0, i32:$src1))] +>; +def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64", + [(set i64:$sdst, (shl i64:$src0, i32:$src1))] +>; +def S_LSHR_B32 : SOP2_32 <"s_lshr_b32", + [(set i32:$sdst, (srl i32:$src0, i32:$src1))] +>; +def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64", + [(set i64:$sdst, (srl i64:$src0, i32:$src1))] +>; +def S_ASHR_I32 : SOP2_32 <"s_ashr_i32", + [(set i32:$sdst, (sra i32:$src0, i32:$src1))] +>; +def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", + [(set i64:$sdst, (sra i64:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + +def S_BFM_B32 : SOP2_32 <"s_bfm_b32", + [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">; +def S_MUL_I32 : SOP2_32 <"s_mul_i32", + [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> { + let isCommutable = 1; +} + +} // End AddedComplexity = 1 + +let Defs = [SCC] in { +def S_BFE_U32 : SOP2_32 <"s_bfe_u32">; +def S_BFE_I32 : SOP2_32 <"s_bfe_i32">; +def S_BFE_U64 : SOP2_64_32 <"s_bfe_u64">; +def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">; +} // End Defs = [SCC] + +def S_CBRANCH_G_FORK : SOP2_Pseudo < + "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), + "$src0, $src1" +> { + let has_sdst = 0; +} + +let Defs = [SCC] in { +def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; +} // End Defs = [SCC] + + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + +class SOPK_Pseudo <string opName, dag outs, dag ins, + string asmOps, list<dag> pattern=[]> : + InstSI <outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let SubtargetPredicate = isGCN; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPK = 1; + let SchedRW = [WriteSALU]; + let UseNamedOperandTable = 1; + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_sdst = 1; +} + +class SOPK_Real<bits<5> op, SOPK_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, + ps.Mnemonic # " " # ps.AsmOperands, []> { + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let DisableEncoding = ps.DisableEncoding; + let Constraints = ps.Constraints; + + // encoding + bits<7> sdst; + bits<16> simm16; + bits<32> imm; +} + +class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> : + SOPK_Real <op, ps>, + Enc32 { + let Inst{15-0} = simm16; + let Inst{22-16} = !if(ps.has_sdst, sdst, ?); + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding +} + +class SOPK_Real64<bits<5> op, SOPK_Pseudo ps> : + SOPK_Real<op, ps>, + Enc64 { + let Inst{15-0} = simm16; + let Inst{22-16} = !if(ps.has_sdst, sdst, ?); + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding + let Inst{63-32} = imm; +} + +class SOPKInstTable <bit is_sopk, string cmpOp = ""> { + bit IsSOPK = is_sopk; + string BaseCmpOp = cmpOp; +} + +class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < + opName, + (outs SReg_32:$sdst), + (ins u16imm:$simm16), + "$sdst, $simm16", + pattern>; + +class SOPK_SCC <string 
opName, string base_op = ""> : SOPK_Pseudo < + opName, + (outs), + (ins SReg_32:$sdst, u16imm:$simm16), + "$sdst, $simm16", []>, + SOPKInstTable<1, base_op>{ + let Defs = [SCC]; +} + +class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo < + opName, + (outs SReg_32:$sdst), + (ins SReg_32:$src0, u16imm:$simm16), + "$sdst, $simm16", + pattern +>; + +let isReMaterializable = 1, isMoveImm = 1 in { +def S_MOVK_I32 : SOPK_32 <"s_movk_i32">; +} // End isReMaterializable = 1 +let Uses = [SCC] in { +def S_CMOVK_I32 : SOPK_32 <"s_cmovk_i32">; +} + +let isCompare = 1 in { + +// This instruction is disabled for now until we can figure out how to teach +// the instruction selector to correctly use the S_CMP* vs V_CMP* +// instructions. +// +// When this instruction is enabled the code generator sometimes produces this +// invalid sequence: +// +// SCC = S_CMPK_EQ_I32 SGPR0, imm +// VCC = COPY SCC +// VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 +// +// def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", +// [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] +// >; + +def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; +def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">; +def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">; +def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">; +def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">; +def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">; + +let SOPKZext = 1 in { +def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">; +def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">; +def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">; +def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">; +def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">; +def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">; +} // End SOPKZext = 1 +} // End isCompare = 1 + +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">; + def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">; +} + +def S_CBRANCH_I_FORK : SOPK_Pseudo < + "s_cbranch_i_fork", + (outs), (ins SReg_64:$sdst, u16imm:$simm16), + "$sdst, $simm16" +>; + +let mayLoad = 1 in { +def S_GETREG_B32 : SOPK_Pseudo < + "s_getreg_b32", + (outs SReg_32:$sdst), (ins hwreg:$simm16), + "$sdst, $simm16" +>; +} + +let hasSideEffects = 1 in { + +def S_SETREG_B32 : SOPK_Pseudo < + "s_setreg_b32", + (outs), (ins SReg_32:$sdst, hwreg:$simm16), + "$simm16, $sdst", + [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))] +>; + +// FIXME: Not on SI? +//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">; + +def S_SETREG_IMM32_B32 : SOPK_Pseudo < + "s_setreg_imm32_b32", + (outs), (ins i32imm:$imm, hwreg:$simm16), + "$simm16, $imm"> { + let Size = 8; // Unlike every other SOPK instruction. 
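  // The extra four bytes hold the trailing 32-bit literal, which SOPK_Real64
  // places in Inst{63-32}.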
+ let has_sdst = 0; +} + +} // End hasSideEffects = 1 + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +class SOPCe <bits<7> op> : Enc32 { + bits<8> src0; + bits<8> src1; + + let Inst{7-0} = src0; + let Inst{15-8} = src1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; +} + +class SOPC <bits<7> op, dag outs, dag ins, string asm, + list<dag> pattern = []> : + InstSI<outs, ins, asm, pattern>, SOPCe <op> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPC = 1; + let isCodeGenOnly = 0; + let Defs = [SCC]; + let SchedRW = [WriteSALU]; + let UseNamedOperandTable = 1; + let SubtargetPredicate = isGCN; +} + +class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, + string opName, list<dag> pattern = []> : SOPC < + op, (outs), (ins rc0:$src0, rc1:$src1), + opName#" $src0, $src1", pattern > { + let Defs = [SCC]; +} +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC_Base < + op, rc, rc, opName, + [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { +} + +class SOPC_CMP_32<bits<7> op, string opName, + PatLeaf cond = COND_NULL, string revOp = opName> + : SOPC_Helper<op, SSrc_b32, i32, opName, cond>, + Commutable_REV<revOp, !eq(revOp, opName)>, + SOPKInstTable<0, opName> { + let isCompare = 1; + let isCommutable = 1; +} + +class SOPC_CMP_64<bits<7> op, string opName, + PatLeaf cond = COND_NULL, string revOp = opName> + : SOPC_Helper<op, SSrc_b64, i64, opName, cond>, + Commutable_REV<revOp, !eq(revOp, opName)> { + let isCompare = 1; + let isCommutable = 1; +} + +class SOPC_32<bits<7> op, string opName, list<dag> pattern = []> + : SOPC_Base<op, SSrc_b32, SSrc_b32, opName, pattern>; + +class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []> + : SOPC_Base<op, SSrc_b64, SSrc_b32, opName, pattern>; + +def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_CMP_32 <0x01, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_CMP_32 <0x02, "s_cmp_gt_i32", COND_SGT>; +def S_CMP_GE_I32 : SOPC_CMP_32 <0x03, "s_cmp_ge_i32", COND_SGE>; +def S_CMP_LT_I32 : SOPC_CMP_32 <0x04, "s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">; +def S_CMP_LE_I32 : SOPC_CMP_32 <0x05, "s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">; +def S_CMP_EQ_U32 : SOPC_CMP_32 <0x06, "s_cmp_eq_u32", COND_EQ>; +def S_CMP_LG_U32 : SOPC_CMP_32 <0x07, "s_cmp_lg_u32", COND_NE>; +def S_CMP_GT_U32 : SOPC_CMP_32 <0x08, "s_cmp_gt_u32", COND_UGT>; +def S_CMP_GE_U32 : SOPC_CMP_32 <0x09, "s_cmp_ge_u32", COND_UGE>; +def S_CMP_LT_U32 : SOPC_CMP_32 <0x0a, "s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">; +def S_CMP_LE_U32 : SOPC_CMP_32 <0x0b, "s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">; + +def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">; +def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">; +def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">; +def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">; +def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">; + +let SubtargetPredicate = isVI in { +def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>; +def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>; +} + +let SubtargetPredicate = HasVGPRIndexMode in { +def S_SET_GPR_IDX_ON : SOPC <0x11, + (outs), + (ins SSrc_b32:$src0, GPRIdxMode:$src1), + "s_set_gpr_idx_on $src0,$src1"> { + let Defs = [M0]; // No scc def + let Uses = [M0]; // Other bits of m0 unmodified. 
+ let hasSideEffects = 1; // Sets mode.gpr_idx_en + let FixedSize = 1; +} +} + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +class SOPPe <bits<7> op> : Enc32 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding +} + +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + let Size = 4; + let SchedRW = [WriteSALU]; + + let UseNamedOperandTable = 1; + let SubtargetPredicate = isGCN; +} + + +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", + [(AMDGPUendpgm)]> { + let simm16 = 0; + let isBarrier = 1; + let isReturn = 1; +} + +let isBranch = 1, SchedRW = [WriteBranch] in { +def S_BRANCH : SOPP < + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { + let isBarrier = 1; +} + +let Uses = [SCC] in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins sopp_brtarget:$simm16), + "s_cbranch_scc0 $simm16" +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins sopp_brtarget:$simm16), + "s_cbranch_scc1 $simm16", + [(si_uniform_br_scc SCC, bb:$simm16)] +>; +} // End Uses = [SCC] + +let Uses = [VCC] in { +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins sopp_brtarget:$simm16), + "s_cbranch_vccz $simm16" +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins sopp_brtarget:$simm16), + "s_cbranch_vccnz $simm16" +>; +} // End Uses = [VCC] + +let Uses = [EXEC] in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins sopp_brtarget:$simm16), + "s_cbranch_execz $simm16" +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins sopp_brtarget:$simm16), + "s_cbranch_execnz $simm16" +>; +} // End Uses = [EXEC] + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", + [(int_amdgcn_s_barrier)]> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let mayLoad = 1; + let mayStore = 1; + let isConvergent = 1; +} + +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; + +// On SI the documentation says sleep for approximately 64 * low 2 +// bits, consistent with the reported maximum of 448. On VI the +// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the +// maximum really 15 on VI? +def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), + "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; + +let Uses = [EXEC, M0] in { +// FIXME: Should this be mayLoad+mayStore? 
+def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] +>; + +def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16", + [(AMDGPUsendmsghalt (i32 imm:$simm16))] +>; +} // End Uses = [EXEC, M0] + +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16", + [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; +} +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16", + [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; +} +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} + +let SubtargetPredicate = HasVGPRIndexMode in { +def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> { + let simm16 = 0; +} +} +} // End hasSideEffects + +let SubtargetPredicate = HasVGPRIndexMode in { +def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16), + "s_set_gpr_idx_mode$simm16"> { + let Defs = [M0]; +} +} + +let Predicates = [isGCN] in { + +//===----------------------------------------------------------------------===// +// S_GETREG_B32 Intrinsic Pattern. +//===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_s_getreg imm:$simm16), + (S_GETREG_B32 (as_i16imm $simm16)) +>; + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (ctpop i64:$src)), + (i64 (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, + (S_MOV_B32 (i32 0)), sub1)) +>; + +def : Pat < + (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (S_ABS_I32 $x) +>; + +def : Pat < + (i16 imm:$imm), + (S_MOV_B32 imm:$imm) +>; + +// Same as a 32-bit inreg +def : Pat< + (i32 (sext i16:$src)), + (S_SEXT_I32_I16 $src) +>; + + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. +def : Pat < + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) +>; + +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple +// outputs. 
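// Hence the patterns below wrap each 32-bit piece in COPY_TO_REGCLASS and then
// assemble the 64-bit result with REG_SEQUENCE (sub0/sub1).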
+def : Pat< + (i64 (zext i16:$src)), + (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : Pat < + (i64 (sext i16:$src)), + (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) +>; + +def : Pat< + (i32 (zext i16:$src)), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) +>; + + + +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_amdgcn_s_waitcnt i32:$simm16), + (S_WAITCNT (as_i16imm $simm16)) +>; + +} // End isGCN predicate + + +//===----------------------------------------------------------------------===// +// Real target instructions, move this to the appropriate subtarget TD file +//===----------------------------------------------------------------------===// + +class Select_si<string opName> : + SIMCInstr<opName, SIEncodingFamily.SI> { + list<Predicate> AssemblerPredicates = [isSICI]; + string DecoderNamespace = "SICI"; +} + +class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> : + SOP1_Real<op, ps>, + Select_si<ps.Mnemonic>; + +class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> : + SOP2_Real<op, ps>, + Select_si<ps.Mnemonic>; + +class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> : + SOPK_Real32<op, ps>, + Select_si<ps.Mnemonic>; + +def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>; +def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>; +def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>; +def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>; +def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>; +def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>; +def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>; +def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>; +def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>; +def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>; +def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>; +def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>; +def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>; +def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>; +def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>; +def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>; +def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>; +def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>; +def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>; +def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>; +def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>; +def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>; +def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>; +def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>; +def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>; +def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>; +def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>; +def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>; +def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>; +def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>; +def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>; +def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>; +def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>; +def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>; +def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, 
S_XOR_SAVEEXEC_B64>; +def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>; +def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>; +def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>; +def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>; +def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>; +def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>; +def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>; +def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>; +def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>; +def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>; +def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>; +def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>; +def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>; +def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>; +def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>; + +def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>; +def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>; +def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>; +def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>; +def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>; +def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>; +def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>; +def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>; +def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>; +def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>; +def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>; +def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>; +def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>; +def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>; +def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>; +def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>; +def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>; +def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>; +def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>; +def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>; +def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>; +def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>; +def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>; +def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>; +def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>; +def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>; +def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>; +def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>; +def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>; +def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>; +def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>; +def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>; +def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>; +def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>; +def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>; +def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>; +def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>; +def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>; +def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>; +def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>; +def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>; +def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>; +def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>; + +def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>; +def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>; +def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>; +def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>; +def S_CMPK_GT_I32_si : 
SOPK_Real_si <0x05, S_CMPK_GT_I32>; +def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>; +def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>; +def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>; +def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>; +def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>; +def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>; +def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>; +def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>; +def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>; +def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>; +def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>; +def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>; +def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>; +def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>; +//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments +def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>, + Select_si<S_SETREG_IMM32_B32.Mnemonic>; + + +class Select_vi<string opName> : + SIMCInstr<opName, SIEncodingFamily.VI> { + list<Predicate> AssemblerPredicates = [isVI]; + string DecoderNamespace = "VI"; +} + +class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> : + SOP1_Real<op, ps>, + Select_vi<ps.Mnemonic>; + + +class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> : + SOP2_Real<op, ps>, + Select_vi<ps.Mnemonic>; + +class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> : + SOPK_Real32<op, ps>, + Select_vi<ps.Mnemonic>; + +def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>; +def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>; +def S_CMOV_B32_vi : SOP1_Real_vi <0x02, S_CMOV_B32>; +def S_CMOV_B64_vi : SOP1_Real_vi <0x03, S_CMOV_B64>; +def S_NOT_B32_vi : SOP1_Real_vi <0x04, S_NOT_B32>; +def S_NOT_B64_vi : SOP1_Real_vi <0x05, S_NOT_B64>; +def S_WQM_B32_vi : SOP1_Real_vi <0x06, S_WQM_B32>; +def S_WQM_B64_vi : SOP1_Real_vi <0x07, S_WQM_B64>; +def S_BREV_B32_vi : SOP1_Real_vi <0x08, S_BREV_B32>; +def S_BREV_B64_vi : SOP1_Real_vi <0x09, S_BREV_B64>; +def S_BCNT0_I32_B32_vi : SOP1_Real_vi <0x0a, S_BCNT0_I32_B32>; +def S_BCNT0_I32_B64_vi : SOP1_Real_vi <0x0b, S_BCNT0_I32_B64>; +def S_BCNT1_I32_B32_vi : SOP1_Real_vi <0x0c, S_BCNT1_I32_B32>; +def S_BCNT1_I32_B64_vi : SOP1_Real_vi <0x0d, S_BCNT1_I32_B64>; +def S_FF0_I32_B32_vi : SOP1_Real_vi <0x0e, S_FF0_I32_B32>; +def S_FF0_I32_B64_vi : SOP1_Real_vi <0x0f, S_FF0_I32_B64>; +def S_FF1_I32_B32_vi : SOP1_Real_vi <0x10, S_FF1_I32_B32>; +def S_FF1_I32_B64_vi : SOP1_Real_vi <0x11, S_FF1_I32_B64>; +def S_FLBIT_I32_B32_vi : SOP1_Real_vi <0x12, S_FLBIT_I32_B32>; +def S_FLBIT_I32_B64_vi : SOP1_Real_vi <0x13, S_FLBIT_I32_B64>; +def S_FLBIT_I32_vi : SOP1_Real_vi <0x14, S_FLBIT_I32>; +def S_FLBIT_I32_I64_vi : SOP1_Real_vi <0x15, S_FLBIT_I32_I64>; +def S_SEXT_I32_I8_vi : SOP1_Real_vi <0x16, S_SEXT_I32_I8>; +def S_SEXT_I32_I16_vi : SOP1_Real_vi <0x17, S_SEXT_I32_I16>; +def S_BITSET0_B32_vi : SOP1_Real_vi <0x18, S_BITSET0_B32>; +def S_BITSET0_B64_vi : SOP1_Real_vi <0x19, S_BITSET0_B64>; +def S_BITSET1_B32_vi : SOP1_Real_vi <0x1a, S_BITSET1_B32>; +def S_BITSET1_B64_vi : SOP1_Real_vi <0x1b, S_BITSET1_B64>; +def S_GETPC_B64_vi : SOP1_Real_vi <0x1c, S_GETPC_B64>; +def S_SETPC_B64_vi : SOP1_Real_vi <0x1d, S_SETPC_B64>; +def S_SWAPPC_B64_vi : SOP1_Real_vi <0x1e, S_SWAPPC_B64>; +def S_RFE_B64_vi : SOP1_Real_vi <0x1f, S_RFE_B64>; +def S_AND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x20, S_AND_SAVEEXEC_B64>; +def S_OR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x21, S_OR_SAVEEXEC_B64>; +def 
S_XOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x22, S_XOR_SAVEEXEC_B64>; +def S_ANDN2_SAVEEXEC_B64_vi: SOP1_Real_vi <0x23, S_ANDN2_SAVEEXEC_B64>; +def S_ORN2_SAVEEXEC_B64_vi : SOP1_Real_vi <0x24, S_ORN2_SAVEEXEC_B64>; +def S_NAND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x25, S_NAND_SAVEEXEC_B64>; +def S_NOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x26, S_NOR_SAVEEXEC_B64>; +def S_XNOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x27, S_XNOR_SAVEEXEC_B64>; +def S_QUADMASK_B32_vi : SOP1_Real_vi <0x28, S_QUADMASK_B32>; +def S_QUADMASK_B64_vi : SOP1_Real_vi <0x29, S_QUADMASK_B64>; +def S_MOVRELS_B32_vi : SOP1_Real_vi <0x2a, S_MOVRELS_B32>; +def S_MOVRELS_B64_vi : SOP1_Real_vi <0x2b, S_MOVRELS_B64>; +def S_MOVRELD_B32_vi : SOP1_Real_vi <0x2c, S_MOVRELD_B32>; +def S_MOVRELD_B64_vi : SOP1_Real_vi <0x2d, S_MOVRELD_B64>; +def S_CBRANCH_JOIN_vi : SOP1_Real_vi <0x2e, S_CBRANCH_JOIN>; +def S_MOV_REGRD_B32_vi : SOP1_Real_vi <0x2f, S_MOV_REGRD_B32>; +def S_ABS_I32_vi : SOP1_Real_vi <0x30, S_ABS_I32>; +def S_MOV_FED_B32_vi : SOP1_Real_vi <0x31, S_MOV_FED_B32>; +def S_SET_GPR_IDX_IDX_vi : SOP1_Real_vi <0x32, S_SET_GPR_IDX_IDX>; + +def S_ADD_U32_vi : SOP2_Real_vi <0x00, S_ADD_U32>; +def S_ADD_I32_vi : SOP2_Real_vi <0x02, S_ADD_I32>; +def S_SUB_U32_vi : SOP2_Real_vi <0x01, S_SUB_U32>; +def S_SUB_I32_vi : SOP2_Real_vi <0x03, S_SUB_I32>; +def S_ADDC_U32_vi : SOP2_Real_vi <0x04, S_ADDC_U32>; +def S_SUBB_U32_vi : SOP2_Real_vi <0x05, S_SUBB_U32>; +def S_MIN_I32_vi : SOP2_Real_vi <0x06, S_MIN_I32>; +def S_MIN_U32_vi : SOP2_Real_vi <0x07, S_MIN_U32>; +def S_MAX_I32_vi : SOP2_Real_vi <0x08, S_MAX_I32>; +def S_MAX_U32_vi : SOP2_Real_vi <0x09, S_MAX_U32>; +def S_CSELECT_B32_vi : SOP2_Real_vi <0x0a, S_CSELECT_B32>; +def S_CSELECT_B64_vi : SOP2_Real_vi <0x0b, S_CSELECT_B64>; +def S_AND_B32_vi : SOP2_Real_vi <0x0c, S_AND_B32>; +def S_AND_B64_vi : SOP2_Real_vi <0x0d, S_AND_B64>; +def S_OR_B32_vi : SOP2_Real_vi <0x0e, S_OR_B32>; +def S_OR_B64_vi : SOP2_Real_vi <0x0f, S_OR_B64>; +def S_XOR_B32_vi : SOP2_Real_vi <0x10, S_XOR_B32>; +def S_XOR_B64_vi : SOP2_Real_vi <0x11, S_XOR_B64>; +def S_ANDN2_B32_vi : SOP2_Real_vi <0x12, S_ANDN2_B32>; +def S_ANDN2_B64_vi : SOP2_Real_vi <0x13, S_ANDN2_B64>; +def S_ORN2_B32_vi : SOP2_Real_vi <0x14, S_ORN2_B32>; +def S_ORN2_B64_vi : SOP2_Real_vi <0x15, S_ORN2_B64>; +def S_NAND_B32_vi : SOP2_Real_vi <0x16, S_NAND_B32>; +def S_NAND_B64_vi : SOP2_Real_vi <0x17, S_NAND_B64>; +def S_NOR_B32_vi : SOP2_Real_vi <0x18, S_NOR_B32>; +def S_NOR_B64_vi : SOP2_Real_vi <0x19, S_NOR_B64>; +def S_XNOR_B32_vi : SOP2_Real_vi <0x1a, S_XNOR_B32>; +def S_XNOR_B64_vi : SOP2_Real_vi <0x1b, S_XNOR_B64>; +def S_LSHL_B32_vi : SOP2_Real_vi <0x1c, S_LSHL_B32>; +def S_LSHL_B64_vi : SOP2_Real_vi <0x1d, S_LSHL_B64>; +def S_LSHR_B32_vi : SOP2_Real_vi <0x1e, S_LSHR_B32>; +def S_LSHR_B64_vi : SOP2_Real_vi <0x1f, S_LSHR_B64>; +def S_ASHR_I32_vi : SOP2_Real_vi <0x20, S_ASHR_I32>; +def S_ASHR_I64_vi : SOP2_Real_vi <0x21, S_ASHR_I64>; +def S_BFM_B32_vi : SOP2_Real_vi <0x22, S_BFM_B32>; +def S_BFM_B64_vi : SOP2_Real_vi <0x23, S_BFM_B64>; +def S_MUL_I32_vi : SOP2_Real_vi <0x24, S_MUL_I32>; +def S_BFE_U32_vi : SOP2_Real_vi <0x25, S_BFE_U32>; +def S_BFE_I32_vi : SOP2_Real_vi <0x26, S_BFE_I32>; +def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>; +def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>; +def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>; +def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>; + +def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>; +def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>; +def S_CMPK_EQ_I32_vi : SOPK_Real_vi 
<0x02, S_CMPK_EQ_I32>; +def S_CMPK_LG_I32_vi : SOPK_Real_vi <0x03, S_CMPK_LG_I32>; +def S_CMPK_GT_I32_vi : SOPK_Real_vi <0x04, S_CMPK_GT_I32>; +def S_CMPK_GE_I32_vi : SOPK_Real_vi <0x05, S_CMPK_GE_I32>; +def S_CMPK_LT_I32_vi : SOPK_Real_vi <0x06, S_CMPK_LT_I32>; +def S_CMPK_LE_I32_vi : SOPK_Real_vi <0x07, S_CMPK_LE_I32>; +def S_CMPK_EQ_U32_vi : SOPK_Real_vi <0x08, S_CMPK_EQ_U32>; +def S_CMPK_LG_U32_vi : SOPK_Real_vi <0x09, S_CMPK_LG_U32>; +def S_CMPK_GT_U32_vi : SOPK_Real_vi <0x0A, S_CMPK_GT_U32>; +def S_CMPK_GE_U32_vi : SOPK_Real_vi <0x0B, S_CMPK_GE_U32>; +def S_CMPK_LT_U32_vi : SOPK_Real_vi <0x0C, S_CMPK_LT_U32>; +def S_CMPK_LE_U32_vi : SOPK_Real_vi <0x0D, S_CMPK_LE_U32>; +def S_ADDK_I32_vi : SOPK_Real_vi <0x0E, S_ADDK_I32>; +def S_MULK_I32_vi : SOPK_Real_vi <0x0F, S_MULK_I32>; +def S_CBRANCH_I_FORK_vi : SOPK_Real_vi <0x10, S_CBRANCH_I_FORK>; +def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>; +def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>; +//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments +def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>, + Select_vi<S_SETREG_IMM32_B32.Mnemonic>; diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 2112135..9908fc0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -18,13 +18,20 @@ using namespace llvm; /// \brief The target which suports all AMD GPUs. This will eventually /// be deprecated and there will be a R600 target and a GCN target. -Target llvm::TheAMDGPUTarget; +Target &llvm::getTheAMDGPUTarget() { + static Target TheAMDGPUTarget; + return TheAMDGPUTarget; +} /// \brief The target for GCN GPUs -Target llvm::TheGCNTarget; +Target &llvm::getTheGCNTarget() { + static Target TheGCNTarget; + return TheGCNTarget; +} /// \brief Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeAMDGPUTargetInfo() { - RegisterTarget<Triple::r600, false> - R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); - RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); + RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600", + "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn", + "AMD GCN GPUs"); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index c6f9142..5f651d4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -8,10 +8,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" #include "AMDGPU.h" +#include "SIDefines.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" @@ -24,6 +27,55 @@ #include "AMDGPUGenRegisterInfo.inc" #undef GET_REGINFO_ENUM +#define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_NAMED_OPS +#undef GET_INSTRINFO_ENUM + +namespace { + +/// \returns Bit mask for given bit \p Shift and bit \p Width. 
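/// For example, getBitMask(4, 3) == 0x70 and getBitMask(8, 4) == 0xf00.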
+unsigned getBitMask(unsigned Shift, unsigned Width) { + return ((1 << Width) - 1) << Shift; +} + +/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width. +/// +/// \returns Packed \p Dst. +unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { + Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width); + Dst |= (Src << Shift) & getBitMask(Shift, Width); + return Dst; +} + +/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width. +/// +/// \returns Unpacked bits. +unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { + return (Src & getBitMask(Shift, Width)) >> Shift; +} + +/// \returns Vmcnt bit shift. +unsigned getVmcntBitShift() { return 0; } + +/// \returns Vmcnt bit width. +unsigned getVmcntBitWidth() { return 4; } + +/// \returns Expcnt bit shift. +unsigned getExpcntBitShift() { return 4; } + +/// \returns Expcnt bit width. +unsigned getExpcntBitWidth() { return 3; } + +/// \returns Lgkmcnt bit shift. +unsigned getLgkmcntBitShift() { return 8; } + +/// \returns Lgkmcnt bit width. +unsigned getLgkmcntBitWidth() { return 4; } + +} // anonymous namespace + namespace llvm { namespace AMDGPU { @@ -35,15 +87,27 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { if (Features.test(FeatureISAVersion7_0_1)) return {7, 0, 1}; + if (Features.test(FeatureISAVersion7_0_2)) + return {7, 0, 2}; + if (Features.test(FeatureISAVersion8_0_0)) return {8, 0, 0}; if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; + if (Features.test(FeatureISAVersion8_0_2)) + return {8, 0, 2}; + if (Features.test(FeatureISAVersion8_0_3)) return {8, 0, 3}; + if (Features.test(FeatureISAVersion8_0_4)) + return {8, 0, 4}; + + if (Features.test(FeatureISAVersion8_1_0)) + return {8, 1, 0}; + return {0, 0, 0}; } @@ -109,6 +173,10 @@ bool isReadOnlySegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } +bool shouldEmitConstantsToTextSection(const Triple &TT) { + return TT.getOS() != Triple::AMDHSA; +} + int getIntegerAttribute(const Function &F, StringRef Name, int Default) { Attribute A = F.getFnAttribute(Name); int Result = Default; @@ -124,8 +192,88 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default) { return Result; } -unsigned getMaximumWorkGroupSize(const Function &F) { - return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256); +std::pair<int, int> getIntegerPairAttribute(const Function &F, + StringRef Name, + std::pair<int, int> Default, + bool OnlyFirstRequired) { + Attribute A = F.getFnAttribute(Name); + if (!A.isStringAttribute()) + return Default; + + LLVMContext &Ctx = F.getContext(); + std::pair<int, int> Ints = Default; + std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(','); + if (Strs.first.trim().getAsInteger(0, Ints.first)) { + Ctx.emitError("can't parse first integer attribute " + Name); + return Default; + } + if (Strs.second.trim().getAsInteger(0, Ints.second)) { + if (!OnlyFirstRequired || Strs.second.trim().size()) { + Ctx.emitError("can't parse second integer attribute " + Name); + return Default; + } + } + + return Ints; +} + +unsigned getWaitcntBitMask(IsaVersion Version) { + unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); + unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + return Vmcnt | Expcnt | Lgkmcnt; +} + +unsigned getVmcntBitMask(IsaVersion Version) { + return (1 << getVmcntBitWidth()) - 
1; +} + +unsigned getExpcntBitMask(IsaVersion Version) { + return (1 << getExpcntBitWidth()) - 1; +} + +unsigned getLgkmcntBitMask(IsaVersion Version) { + return (1 << getLgkmcntBitWidth()) - 1; +} + +unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) { + return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +} + +unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) { + return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); +} + +unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) { + return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); +} + +void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, + unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) { + Vmcnt = decodeVmcnt(Version, Waitcnt); + Expcnt = decodeExpcnt(Version, Waitcnt); + Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); +} + +unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) { + return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +} + +unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) { + return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); +} + +unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { + return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); +} + +unsigned encodeWaitcnt(IsaVersion Version, + unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { + unsigned Waitcnt = getWaitcntBitMask(Version); + Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); + Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt); + Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt); + return Waitcnt; } unsigned getInitialPSInputAddr(const Function &F) { @@ -179,5 +327,135 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { return Reg; } +bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; +} + +bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + return true; + default: + return false; + } +} + +bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST; +} + +// Avoid using MCRegisterClass::getSize, since that function will go away +// (move from MC* level to Target* level). Return size in bits. 
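// Callers that need a byte count, such as getRegOperandSize below, divide the
// result by 8.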
+unsigned getRegBitWidth(unsigned RCID) { + switch (RCID) { + case AMDGPU::SGPR_32RegClassID: + case AMDGPU::VGPR_32RegClassID: + case AMDGPU::VS_32RegClassID: + case AMDGPU::SReg_32RegClassID: + case AMDGPU::SReg_32_XM0RegClassID: + return 32; + case AMDGPU::SGPR_64RegClassID: + case AMDGPU::VS_64RegClassID: + case AMDGPU::SReg_64RegClassID: + case AMDGPU::VReg_64RegClassID: + return 64; + case AMDGPU::VReg_96RegClassID: + return 96; + case AMDGPU::SGPR_128RegClassID: + case AMDGPU::SReg_128RegClassID: + case AMDGPU::VReg_128RegClassID: + return 128; + case AMDGPU::SReg_256RegClassID: + case AMDGPU::VReg_256RegClassID: + return 256; + case AMDGPU::SReg_512RegClassID: + case AMDGPU::VReg_512RegClassID: + return 512; + default: + llvm_unreachable("Unexpected register class"); + } +} + +unsigned getRegBitWidth(const MCRegisterClass &RC) { + return getRegBitWidth(RC.getID()); +} + +unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, + unsigned OpNo) { + unsigned RCID = Desc.OpInfo[OpNo].RegClass; + return getRegBitWidth(MRI->getRegClass(RCID)) / 8; +} + +bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { + if (Literal >= -16 && Literal <= 64) + return true; + + uint64_t Val = static_cast<uint64_t>(Literal); + return (Val == DoubleToBits(0.0)) || + (Val == DoubleToBits(1.0)) || + (Val == DoubleToBits(-1.0)) || + (Val == DoubleToBits(0.5)) || + (Val == DoubleToBits(-0.5)) || + (Val == DoubleToBits(2.0)) || + (Val == DoubleToBits(-2.0)) || + (Val == DoubleToBits(4.0)) || + (Val == DoubleToBits(-4.0)) || + (Val == 0x3fc45f306dc9c882 && HasInv2Pi); +} + +bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { + if (Literal >= -16 && Literal <= 64) + return true; + + // The actual type of the operand does not seem to matter as long + // as the bits match one of the inline immediate values. For example: + // + // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, + // so it is a legal inline immediate. + // + // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in + // floating-point, so it is a legal inline immediate. 
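  //
  // By the same token, 0x3dcccccd (0.1f) matches none of the entries checked
  // below and is therefore not an inline immediate.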
+ + uint32_t Val = static_cast<uint32_t>(Literal); + return (Val == FloatToBits(0.0f)) || + (Val == FloatToBits(1.0f)) || + (Val == FloatToBits(-1.0f)) || + (Val == FloatToBits(0.5f)) || + (Val == FloatToBits(-0.5f)) || + (Val == FloatToBits(2.0f)) || + (Val == FloatToBits(-2.0f)) || + (Val == FloatToBits(4.0f)) || + (Val == FloatToBits(-4.0f)) || + (Val == 0x3e22f983 && HasInv2Pi); +} + +bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + if (Literal >= -16 && Literal <= 64) + return true; + + uint16_t Val = static_cast<uint16_t>(Literal); + return Val == 0x3C00 || // 1.0 + Val == 0xBC00 || // -1.0 + Val == 0x3800 || // 0.5 + Val == 0xB800 || // -0.5 + Val == 0x4000 || // 2.0 + Val == 0xC000 || // -2.0 + Val == 0x4400 || // 4.0 + Val == 0xC400 || // -4.0 + Val == 0x3118; // 1/2pi +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 995a904..ea5fc36 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -13,17 +13,29 @@ #include "AMDKernelCodeT.h" #include "llvm/IR/CallingConv.h" +#include "SIDefines.h" + +#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_OPERAND_ENUM + namespace llvm { class FeatureBitset; class Function; class GlobalValue; class MCContext; +class MCInstrDesc; +class MCRegisterClass; +class MCRegisterInfo; class MCSection; class MCSubtargetInfo; namespace AMDGPU { +LLVM_READONLY +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); + struct IsaVersion { unsigned Major; unsigned Minor; @@ -45,9 +57,86 @@ bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); +/// \returns True if constants should be emitted to .text section for given +/// target triple \p TT, false otherwise. +bool shouldEmitConstantsToTextSection(const Triple &TT); + +/// \returns Integer value requested using \p F's \p Name attribute. +/// +/// \returns \p Default if attribute is not present. +/// +/// \returns \p Default and emits error if requested value cannot be converted +/// to integer. int getIntegerAttribute(const Function &F, StringRef Name, int Default); -unsigned getMaximumWorkGroupSize(const Function &F); +/// \returns A pair of integer values requested using \p F's \p Name attribute +/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired +/// is false). +/// +/// \returns \p Default if attribute is not present. +/// +/// \returns \p Default and emits error if one of the requested values cannot be +/// converted to integer, or \p OnlyFirstRequired is false and "second" value is +/// not present. +std::pair<int, int> getIntegerPairAttribute(const Function &F, + StringRef Name, + std::pair<int, int> Default, + bool OnlyFirstRequired = false); + +/// \returns Waitcnt bit mask for given isa \p Version. +unsigned getWaitcntBitMask(IsaVersion Version); + +/// \returns Vmcnt bit mask for given isa \p Version. +unsigned getVmcntBitMask(IsaVersion Version); + +/// \returns Expcnt bit mask for given isa \p Version. +unsigned getExpcntBitMask(IsaVersion Version); + +/// \returns Lgkmcnt bit mask for given isa \p Version. +unsigned getLgkmcntBitMask(IsaVersion Version); + +/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version. 
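// A minimal usage sketch of these helpers (STI here stands for whatever
// MCSubtargetInfo is at hand; the values are illustrative only):
//
//   IsaVersion V = getIsaVersion(STI.getFeatureBits());
//   unsigned W = encodeWaitcnt(V, /*Vmcnt=*/0, /*Expcnt=*/7, /*Lgkmcnt=*/15);
//   // W == 0xf70: Waitcnt[3:0] = 0, Waitcnt[6:4] = 7, Waitcnt[11:8] = 15.
//   unsigned Vmcnt, Expcnt, Lgkmcnt;
//   decodeWaitcnt(V, W, Vmcnt, Expcnt, Lgkmcnt); // recovers 0 / 7 / 15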
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt); + +/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version. +unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt); + +/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. +unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt); + +/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa +/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and +/// \p Lgkmcnt respectively. +/// +/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: +/// \p Vmcnt = \p Waitcnt[3:0] +/// \p Expcnt = \p Waitcnt[6:4] +/// \p Lgkmcnt = \p Waitcnt[11:8] +void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, + unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); + +/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. +unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt); + +/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version. +unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt); + +/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version. +unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt); + +/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa +/// \p Version. +/// +/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: +/// Waitcnt[3:0] = \p Vmcnt +/// Waitcnt[6:4] = \p Expcnt +/// Waitcnt[11:8] = \p Lgkmcnt +/// +/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given +/// isa \p Version. +unsigned encodeWaitcnt(IsaVersion Version, + unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); + unsigned getInitialPSInputAddr(const Function &F); bool isShader(CallingConv::ID cc); @@ -61,6 +150,66 @@ bool isVI(const MCSubtargetInfo &STI); /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); +/// \brief Can this operand also contain immediate values? +bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); + +/// \brief Is this floating-point operand? +bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo); + +/// \brief Does this opearnd support only inlinable literals? +bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo); + +/// \brief Get the size in bits of a register from the register class \p RC. +unsigned getRegBitWidth(unsigned RCID); + +/// \brief Get the size in bits of a register from the register class \p RC. 
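/// (Equivalent to getRegBitWidth(RC.getID()).)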
+unsigned getRegBitWidth(const MCRegisterClass &RC); + +/// \brief Get size of register operand +unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, + unsigned OpNo); + +LLVM_READNONE +inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return 4; + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return 8; + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + return 2; + + default: + llvm_unreachable("unhandled operand type"); + } +} + +LLVM_READNONE +inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) { + return getOperandSize(Desc.OpInfo[OpNo]); +} + +/// \brief Is this literal inlinable +LLVM_READNONE +bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi); + +LLVM_READNONE +bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi); + +LLVM_READNONE +bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); + } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 3a5ff60..c55eaab 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -16,10 +16,10 @@ #define QNAME(name) amd_kernel_code_t::name #define FLD_T(name) decltype(QNAME(name)), &QNAME(name) -#define FIELD2(sname, name) \ - RECORD(sname, printField<FLD_T(name)>, parseField<FLD_T(name)>) +#define FIELD2(sname, aname, name) \ + RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>) -#define FIELD(name) FIELD2(name, name) +#define FIELD(name) FIELD2(name, name, name) #define PRINTCODEPROP(name) \ @@ -33,7 +33,7 @@ AMD_CODE_PROPERTY_##name##_WIDTH> #define CODEPROP(name, shift) \ - RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift)) + RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift)) // have to define these lambdas because of Set/GetMacro #define PRINTCOMP(GetMacro, Shift) \ @@ -50,32 +50,70 @@ return true; \ } -#define COMPPGM(name, GetMacro, SetMacro, Shift) \ - RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift)) +#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \ + RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift)) -#define COMPPGM1(name, AccMacro) \ - COMPPGM(compute_pgm_rsrc1_##name, \ - G_00B848_##AccMacro, S_00B848_##AccMacro, 0) +#define COMPPGM1(name, aname, AccMacro) \ + COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0) -#define COMPPGM2(name, AccMacro) \ - COMPPGM(compute_pgm_rsrc2_##name, \ - G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32) +#define COMPPGM2(name, aname, AccMacro) \ + COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32) /////////////////////////////////////////////////////////////////////////////// // Begin of the table // Define RECORD(name, print, parse) in your code to get field definitions // and include this file -FIELD2(kernel_code_version_major, amd_kernel_code_version_major), -FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor), -FIELD2(machine_kind, amd_machine_kind), 
-FIELD2(machine_version_major, amd_machine_version_major), -FIELD2(machine_version_minor, amd_machine_version_minor), -FIELD2(machine_version_stepping, amd_machine_version_stepping), +FIELD2(amd_code_version_major, kernel_code_version_major, amd_kernel_code_version_major), +FIELD2(amd_code_version_minor, kernel_code_version_minor, amd_kernel_code_version_minor), +FIELD2(amd_machine_kind, machine_kind, amd_machine_kind), +FIELD2(amd_machine_version_major, machine_version_major, amd_machine_version_major), +FIELD2(amd_machine_version_minor, machine_version_minor, amd_machine_version_minor), +FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_version_stepping), + FIELD(kernel_code_entry_byte_offset), FIELD(kernel_code_prefetch_byte_size), FIELD(max_scratch_backing_memory_byte_size), -FIELD(compute_pgm_resource_registers), + +COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS), +COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS), +COMPPGM1(priority, compute_pgm_rsrc1_priority, PRIORITY), +COMPPGM1(float_mode, compute_pgm_rsrc1_float_mode, FLOAT_MODE), // TODO: split float_mode +COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV), +COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP), +COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE), +COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE), +// TODO: bulky +// TODO: cdbg_user +COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), +COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR), +// TODO: enable_trap_handler +COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN), +COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN), +COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN), +COMPPGM2(enable_sgpr_workgroup_info, compute_pgm_rsrc2_tg_size_en, TG_SIZE_EN), +COMPPGM2(enable_vgpr_workitem_id, compute_pgm_rsrc2_tidig_comp_cnt, TIDIG_COMP_CNT), +COMPPGM2(enable_exception_msb, compute_pgm_rsrc2_excp_en_msb, EXCP_EN_MSB), // TODO: split enable_exception_msb +COMPPGM2(granulated_lds_size, compute_pgm_rsrc2_lds_size, LDS_SIZE), +COMPPGM2(enable_exception, compute_pgm_rsrc2_excp_en, EXCP_EN), // TODO: split enable_exception + +CODEPROP(enable_sgpr_private_segment_buffer, ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER), +CODEPROP(enable_sgpr_dispatch_ptr, ENABLE_SGPR_DISPATCH_PTR), +CODEPROP(enable_sgpr_queue_ptr, ENABLE_SGPR_QUEUE_PTR), +CODEPROP(enable_sgpr_kernarg_segment_ptr, ENABLE_SGPR_KERNARG_SEGMENT_PTR), +CODEPROP(enable_sgpr_dispatch_id, ENABLE_SGPR_DISPATCH_ID), +CODEPROP(enable_sgpr_flat_scratch_init, ENABLE_SGPR_FLAT_SCRATCH_INIT), +CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE), +CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X), +CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y), +CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z), +CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS), +CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE), +CODEPROP(is_ptr64, IS_PTR64), +CODEPROP(is_dynamic_callstack, IS_DYNAMIC_CALLSTACK), +CODEPROP(is_debug_enabled, IS_DEBUG_SUPPORTED), +CODEPROP(is_xnack_enabled, IS_XNACK_SUPPORTED), + FIELD(workitem_private_segment_byte_size), FIELD(workgroup_group_segment_byte_size), FIELD(gds_segment_byte_size), @@ -94,59 +132,8 @@ FIELD(group_segment_alignment), 
FIELD(private_segment_alignment), FIELD(wavefront_size), FIELD(call_convention), -FIELD(runtime_loader_kernel_symbol), - -COMPPGM1(vgprs, VGPRS), -COMPPGM1(sgprs, SGPRS), -COMPPGM1(priority, PRIORITY), -COMPPGM1(float_mode, FLOAT_MODE), -COMPPGM1(priv, PRIV), -COMPPGM1(dx10_clamp, DX10_CLAMP), -COMPPGM1(debug_mode, DEBUG_MODE), -COMPPGM1(ieee_mode, IEEE_MODE), -COMPPGM2(scratch_en, SCRATCH_EN), -COMPPGM2(user_sgpr, USER_SGPR), -COMPPGM2(tgid_x_en, TGID_X_EN), -COMPPGM2(tgid_y_en, TGID_Y_EN), -COMPPGM2(tgid_z_en, TGID_Z_EN), -COMPPGM2(tg_size_en, TG_SIZE_EN), -COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT), -COMPPGM2(excp_en_msb, EXCP_EN_MSB), -COMPPGM2(lds_size, LDS_SIZE), -COMPPGM2(excp_en, EXCP_EN), - -CODEPROP(enable_sgpr_private_segment_buffer, - ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER), -CODEPROP(enable_sgpr_dispatch_ptr, - ENABLE_SGPR_DISPATCH_PTR), -CODEPROP(enable_sgpr_queue_ptr, - ENABLE_SGPR_QUEUE_PTR), -CODEPROP(enable_sgpr_kernarg_segment_ptr, - ENABLE_SGPR_KERNARG_SEGMENT_PTR), -CODEPROP(enable_sgpr_dispatch_id, - ENABLE_SGPR_DISPATCH_ID), -CODEPROP(enable_sgpr_flat_scratch_init, - ENABLE_SGPR_FLAT_SCRATCH_INIT), -CODEPROP(enable_sgpr_private_segment_size, - ENABLE_SGPR_PRIVATE_SEGMENT_SIZE), -CODEPROP(enable_sgpr_grid_workgroup_count_x, - ENABLE_SGPR_GRID_WORKGROUP_COUNT_X), -CODEPROP(enable_sgpr_grid_workgroup_count_y, - ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y), -CODEPROP(enable_sgpr_grid_workgroup_count_z, - ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z), -CODEPROP(enable_ordered_append_gds, - ENABLE_ORDERED_APPEND_GDS), -CODEPROP(private_element_size, - PRIVATE_ELEMENT_SIZE), -CODEPROP(is_ptr64, - IS_PTR64), -CODEPROP(is_dynamic_callstack, - IS_DYNAMIC_CALLSTACK), -CODEPROP(is_debug_enabled, - IS_DEBUG_SUPPORTED), -CODEPROP(is_xnack_enabled, - IS_XNACK_SUPPORTED) +FIELD(runtime_loader_kernel_symbol) +// TODO: control_directive // end of the table /////////////////////////////////////////////////////////////////////////////// diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index f64973a..0333b0a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -24,22 +24,37 @@ using namespace llvm; static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() { static StringRef const Table[] = { "", // not found placeholder -#define RECORD(name, print, parse) #name +#define RECORD(name, altName, print, parse) #name #include "AMDKernelCodeTInfo.h" #undef RECORD }; return makeArrayRef(Table); } -static StringMap<int> createIndexMap(const ArrayRef<StringRef> &a) { +static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() { + static StringRef const Table[] = { + "", // not found placeholder +#define RECORD(name, altName, print, parse) #altName +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names, + const ArrayRef<StringRef> &altNames) { StringMap<int> map; - for (auto Name : a) - map.insert(std::make_pair(Name, map.size())); + assert(names.size() == altNames.size()); + for (unsigned i = 0; i < names.size(); ++i) { + map.insert(std::make_pair(names[i], i)); + map.insert(std::make_pair(altNames[i], i)); + } return map; } static int get_amd_kernel_code_t_FieldIndex(StringRef name) { - static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames()); + static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames(), + 
get_amd_kernel_code_t_FldAltNames()); return map.lookup(name) - 1; // returns -1 if not found } @@ -73,7 +88,7 @@ typedef void(*PrintFx)(StringRef, static ArrayRef<PrintFx> getPrinterTable() { static const PrintFx Table[] = { -#define RECORD(name, print, parse) print +#define RECORD(name, altName, print, parse) print #include "AMDKernelCodeTInfo.h" #undef RECORD }; @@ -145,7 +160,7 @@ typedef bool(*ParseFx)(amd_kernel_code_t &, static ArrayRef<ParseFx> getParserTable() { static const ParseFx Table[] = { -#define RECORD(name, print, parse) parse +#define RECORD(name, altName, print, parse) parse #include "AMDKernelCodeTInfo.h" #undef RECORD }; diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td index 912ed53..1fd1c1e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td @@ -11,283 +11,6 @@ // //===----------------------------------------------------------------------===// -class DSe_vi <bits<8> op> : Enc64 { - bits<8> vdst; - bits<1> gds; - bits<8> addr; - bits<8> data0; - bits<8> data1; - bits<8> offset0; - bits<8> offset1; - - let Inst{7-0} = offset0; - let Inst{15-8} = offset1; - let Inst{16} = gds; - let Inst{24-17} = op; - let Inst{31-26} = 0x36; //encoding - let Inst{39-32} = addr; - let Inst{47-40} = data0; - let Inst{55-48} = data1; - let Inst{63-56} = vdst; -} - -class MUBUFe_vi <bits<7> op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> lds; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{16} = lds; - let Inst{17} = slc; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MTBUFe_vi <bits<4> op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{18-15} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class SMEMe_vi <bits<8> op, bit imm> : Enc64 { - bits<7> sbase; - bits<7> sdst; - bits<1> glc; - - let Inst{5-0} = sbase{6-1}; - let Inst{12-6} = sdst; - let Inst{16} = glc; - let Inst{17} = imm; - let Inst{25-18} = op; - let Inst{31-26} = 0x30; //encoding -} - -class SMEM_IMMe_vi <bits<8> op> : SMEMe_vi<op, 1> { - bits<20> offset; - let Inst{51-32} = offset; -} - -class SMEM_SOFFe_vi <bits<8> op> : SMEMe_vi<op, 0> { - bits<20> soff; - let Inst{51-32} = soff; -} - -class VOP3a_vi <bits<10> op> : Enc64 { - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<1> clamp; - bits<2> omod; - - let Inst{8} = src0_modifiers{1}; - let Inst{9} = src1_modifiers{1}; - let Inst{10} = src2_modifiers{1}; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let 
Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP3e_vi <bits<10> op> : VOP3a_vi <op> { - bits<8> vdst; - - let Inst{7-0} = vdst; -} - -// Encoding used for VOPC instructions encoded as VOP3 -// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst -class VOP3ce_vi <bits<10> op> : VOP3a_vi <op> { - bits<8> sdst; - - let Inst{7-0} = sdst; -} - -class VOP3be_vi <bits<10> op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - bits<1> clamp; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> : - VOPAnyCommon <outs, ins, asm, pattern> { - let DPP = 1; - let Size = 8; - - let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", ""); -} - -class VOP_DPPe : Enc64 { - bits<2> src0_modifiers; - bits<8> src0; - bits<2> src1_modifiers; - bits<9> dpp_ctrl; - bits<1> bound_ctrl; - bits<4> bank_mask; - bits<4> row_mask; - - let Inst{39-32} = src0; - let Inst{48-40} = dpp_ctrl; - let Inst{51} = bound_ctrl; - let Inst{52} = src0_modifiers{0}; // src0_neg - let Inst{53} = src0_modifiers{1}; // src0_abs - let Inst{54} = src1_modifiers{0}; // src1_neg - let Inst{55} = src1_modifiers{1}; // src1_abs - let Inst{59-56} = bank_mask; - let Inst{63-60} = row_mask; -} - -class VOP1_DPPe <bits<8> op> : VOP_DPPe { - bits<8> vdst; - - let Inst{8-0} = 0xfa; // dpp - let Inst{16-9} = op; - let Inst{24-17} = vdst; - let Inst{31-25} = 0x3f; //encoding -} - -class VOP2_DPPe <bits<6> op> : VOP_DPPe { - bits<8> vdst; - bits<8> src1; - - let Inst{8-0} = 0xfa; //dpp - let Inst{16-9} = src1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; //encoding -} - -class VOP_SDWA <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> : - VOPAnyCommon <outs, ins, asm, pattern> { - let SDWA = 1; - let Size = 8; -} - -class VOP_SDWAe : Enc64 { - bits<8> src0; - bits<3> src0_sel; - bits<2> src0_fmodifiers; // {abs,neg} - bits<1> src0_imodifiers; // sext - bits<3> src1_sel; - bits<2> src1_fmodifiers; - bits<1> src1_imodifiers; - bits<3> dst_sel; - bits<2> dst_unused; - bits<1> clamp; - - let Inst{39-32} = src0; - let Inst{42-40} = dst_sel; - let Inst{44-43} = dst_unused; - let Inst{45} = clamp; - let Inst{50-48} = src0_sel; - let Inst{53-52} = src0_fmodifiers; - let Inst{51} = src0_imodifiers; - let Inst{58-56} = src1_sel; - let Inst{61-60} = src1_fmodifiers; - let Inst{59} = src1_imodifiers; -} - -class VOP1_SDWAe <bits<8> op> : VOP_SDWAe { - bits<8> vdst; - - let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = op; - let Inst{24-17} = vdst; - let Inst{31-25} = 0x3f; // encoding -} - -class VOP2_SDWAe <bits<6> op> : VOP_SDWAe { - bits<8> vdst; - bits<8> src1; - - let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = src1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; // encoding -} - -class VOPC_SDWAe <bits<8> op> : VOP_SDWAe { - bits<8> src1; - - let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = src1; - let Inst{24-17} = op; - let Inst{31-25} = 0x3e; // encoding - - 
// VOPC disallows dst_sel and dst_unused as they have no effect on destination - let Inst{42-40} = 0x6; - let Inst{44-43} = 0x2; -} - class EXPe_vi : EXPe { let Inst{31-26} = 0x31; //encoding } diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td index 5c490ab..b45c8fc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -9,150 +9,6 @@ // Instruction definitions for VI and newer. //===----------------------------------------------------------------------===// -let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { - -let DisableSIDecoder = 1 in { - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -defm V_CVT_F16_U16 : VOP1Inst <vop1<0, 0x39>, "v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst <vop1<0, 0x3a>, "v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst <vop1<0, 0x3b>, "v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst <vop1<0, 0x3c>, "v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst <vop1<0, 0x3d>, "v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst <vop1<0, 0x3e>, "v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst <vop1<0, 0x3f>, "v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst <vop1<0, 0x40>, "v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst <vop1<0, 0x41>, "v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst <vop1<0, 0x42>, "v_frexp_mant_f16", - VOP_F16_F16 ->; -defm V_FREXP_EXP_I16_F16 : VOP1Inst <vop1<0, 0x43>, "v_frexp_exp_i16_f16", - VOP_I16_F16 ->; -defm V_FLOOR_F16 : VOP1Inst <vop1<0, 0x44>, "v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst <vop1<0, 0x45>, "v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst <vop1<0, 0x46>, "v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst <vop1<0, 0x47>, "v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst <vop1<0, 0x48>, "v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst <vop1<0, 0x49>, "v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst <vop1<0, 0x4a>, "v_cos_f16", VOP_F16_F16>; - -//===----------------------------------------------------------------------===// -// VOP2 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { - -defm V_ADD_F16 : VOP2Inst <vop2<0, 0x1f>, "v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst <vop2<0, 0x20>, "v_sub_f16", VOP_F16_F16_F16>; -defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16, - null_frag, "v_sub_f16" ->; -defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>; -} // End isCommutable = 1 -defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16", VOP_MADMK>; -let isCommutable = 1 in { -defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16", VOP_MADAK>; -defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>; -defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>; -defm V_MUL_LO_U16 : VOP2Inst <vop2<0,0x29>, "v_mul_lo_u16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LSHLREV_B16 : VOP2Inst <vop2<0,0x2a>, "v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst <vop2<0,0x2b>, "v_lshrrev_b16", VOP_I16_I16_I16>; 
-defm V_ASHRREV_B16 : VOP2Inst <vop2<0,0x2c>, "v_ashrrev_b16", VOP_I16_I16_I16>; -let isCommutable = 1 in { -defm V_MAX_F16 : VOP2Inst <vop2<0,0x2d>, "v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst <vop2<0,0x2e>, "v_min_f16", VOP_F16_F16_F16>; -defm V_MAX_U16 : VOP2Inst <vop2<0,0x2f>, "v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst <vop2<0,0x30>, "v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst <vop2<0,0x31>, "v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; - -//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// -let isCommutable = 1 in { - defm V_MAD_F16 : VOP3Inst <vop3<0, 0x1ea>, "v_mad_f16", VOP_F16_F16_F16_F16>; - defm V_MAD_U16 : VOP3Inst <vop3<0, 0x1eb>, "v_mad_u16", VOP_I16_I16_I16_I16>; - defm V_MAD_I16 : VOP3Inst <vop3<0, 0x1ec>, "v_mad_i16", VOP_I16_I16_I16_I16>; -} -} // let DisableSIDecoder = 1 - -// Aliases to simplify matching of floating-point instructions that -// are VOP2 on SI and VOP3 on VI. - -class SI2_VI3Alias <string name, Instruction inst> : InstAlias < - name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) ->, PredicateControl { - let UseInstAsmMatchConverter = 0; -} - -def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; - -//===----------------------------------------------------------------------===// -// SMEM Instructions -//===----------------------------------------------------------------------===// - -def S_DCACHE_WB : SMEM_Inval <0x21, - "s_dcache_wb", int_amdgcn_s_dcache_wb>; - -def S_DCACHE_WB_VOL : SMEM_Inval <0x23, - "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; - -def S_MEMREALTIME : SMEM_Ret<0x25, - "s_memrealtime", int_amdgcn_s_memrealtime>; - -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - -let Predicates = [isVI] in { - -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; - -//===----------------------------------------------------------------------===// -// DPP Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl), - (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), - (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) ->; - -//===----------------------------------------------------------------------===// -// Misc Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i64 (readcyclecounter)), - (S_MEMREALTIME) ->; - -//===----------------------------------------------------------------------===// -// DS_PERMUTE/DS_BPERMUTE Instructions. 
-//===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { -defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32, - int_amdgcn_ds_permute>; -defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32, - int_amdgcn_ds_bpermute>; -} - -} // End Predicates = [isVI] +FIXME: Deleting this file broke buildbots that don't do full rebuilds. This +file is no longer used by the backend, so it can be deleted once all +the buildbots update there dependencies. diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td new file mode 100644 index 0000000..8cae83c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -0,0 +1,615 @@ +//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP1 Classes +//===----------------------------------------------------------------------===// + +class VOP1e <bits<8> op, VOPProfile P> : Enc32 { + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; //encoding +} + +class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + +class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : + InstSI <P.Outs32, P.Ins32, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, + MnemonicAlias<opName#"_e32", opName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.Asm32; + + let Size = 4; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SubtargetPredicate = isGCN; + + let VOP1 = 1; + let VALU = 1; + let Uses = [EXEC]; + + let AsmVariantName = AMDGPUAsmVariants.Default; + + VOPProfile Pfl = P; +} + +class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, EncodingFamily> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOP1"; +} + +class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { + list<dag> ret = !if(P.HasModifiers, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]); +} + +multiclass VOP1Inst <string opName, VOPProfile P, + SDPatternOperator node = null_frag> { + def _e32 : 
VOP1_Pseudo <opName, P>; + def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; + def _sdwa : VOP1_SDWA_Pseudo <opName, P>; +} + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; +} + +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 + +// FIXME: Specify SchedRW for READFIRSTLANE_B32 +// TODO: Make profile for this, there is VOP3 encoding also +def V_READFIRSTLANE_B32 : + InstSI <(outs SReg_32:$vdst), + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", + [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>, + Enc32 { + + let isCodeGenOnly = 0; + let UseNamedOperandTable = 1; + + let Size = 4; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SubtargetPredicate = isGCN; + + let VOP1 = 1; + let VALU = 1; + let Uses = [EXEC]; + let isConvergent = 1; + + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = src0; + let Inst{16-9} = 0x2; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +let SchedRW = [WriteQuarterRate32] in { +defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; +defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>; +defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>; +defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>; +defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; +defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; +defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>; +defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>; +defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>; +defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>; +defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>; +defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; +defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>; +} // End SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; +defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; +defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>; +defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; +defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; +defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; + +let SchedRW = [WriteQuarterRate32] in { +defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; +defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; +defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>; +defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; 
+} // End SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { +defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; +defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; +} // End SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; + +let SchedRW = [WriteDouble] in { +defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>; +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { +defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; +defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; +} // End SchedRW = [WriteQuarterRate32] + +defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; + +let SchedRW = [WriteDoubleAdd] in { +defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; +defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; +} // End SchedRW = [WriteDoubleAdd] + +defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; +defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>; + +let VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; +} + +// Restrict src0 to be VGPR +def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { + let Src0RC32 = VRegSrc_32; + let Src0RC64 = VRegSrc_32; + + let HasExt = 0; +} + +// Special case because there are no true output operands. Hack vdst +// to be a src operand. The custom inserter must add a tied implicit +// def and use of the super register since there seems to be no way to +// add an implicit def of a virtual register in tablegen. +def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { + let Src0RC32 = VOPDstOperand<VGPR_32>; + let Src0RC64 = VOPDstOperand<VGPR_32>; + + let Outs = (outs); + let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0); + let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); + let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel); + + let Asm32 = getAsm32<1, 1>.ret; + let Asm64 = getAsm64<1, 1, 0>.ret; + let AsmDPP = getAsmDPP<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + + let HasExt = 0; + let HasDst = 0; + let EmitDst = 1; // force vdst emission +} + +let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { +// v_movreld_b32 is a special case because the destination output + // register is really a source. It isn't actually read (but may be + // written), and is only to provide the base register to start + // indexing from. Tablegen seems to not let you define an implicit + // virtual register output for the super register being written into, + // so this must have an implicit def of the register added to it. 
+defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>; +defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>; +defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; +} // End Uses = [M0, EXEC] + +// These instruction only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { +defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; +defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; +defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; +defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; +defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; +} // End SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { +defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + + +let SubtargetPredicate = isCIVI in { + +let SchedRW = [WriteDoubleAdd] in { +defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>; +defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>; +defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>; +defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { +defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>; +defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; +} // End SchedRW = [WriteQuarterRate32] + +} // End SubtargetPredicate = isCIVI + + +let SubtargetPredicate = isVI in { + +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; +defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; +defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; +defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; +defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; +defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; +defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; +defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; +defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; +defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; + +} + +let Predicates = [isVI] in { + +def : Pat< + (f32 (f16_to_fp i16:$src)), + (V_CVT_F32_F16_e32 $src) +>; + +def : Pat< + (i16 (fp_to_f16 f32:$src)), + (V_CVT_F16_F32_e32 $src) +>; + +} + +//===----------------------------------------------------------------------===// +// Target +//===----------------------------------------------------------------------===// + 
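Before the per-subtarget encoding definitions that follow, here is a short standalone sketch of how the VOP1e fields defined at the top of this file pack into one 32-bit instruction word: src0 in bits [8:0], op in [16:9], vdst in [24:17], and the fixed value 0x3f in [31:25]. encodeVOP1 is a hypothetical helper for illustration only, not an LLVM API; the 9-bit src0 codes for VGPRs (256-511) follow the GCN ISA operand encoding, and literals, e64/SDWA/DPP forms, and operand legalization are ignored.

    // Sketch of the VOP1e 32-bit word layout from VOP1Instructions.td.
    #include <cassert>
    #include <cstdint>

    static std::uint32_t encodeVOP1(unsigned Op, unsigned VDst, unsigned Src0) {
      return (Src0 & 0x1ffu)        // src0, bits [8:0]
             | ((Op & 0xffu) << 9)  // op, bits [16:9]
             | ((VDst & 0xffu) << 17) // vdst, bits [24:17]
             | (0x3fu << 25);       // fixed VOP1 encoding, bits [31:25]
    }

    int main() {
      // v_mov_b32 v0, v1: op 0x1 (see the VOP1_Real_* tables below),
      // vdst = 0 for v0, src0 = 256 + 1 = 257 since VGPRs occupy codes
      // 256-511 in the shared 9-bit source field.
      assert(encodeVOP1(/*Op=*/0x1, /*VDst=*/0, /*Src0=*/257) == 0x7e000301u);
      return 0;
    }

The VOP1_Real_si, VOP1_Real_ci and VOP1_Real_vi multiclasses below supply exactly these opcode values per subtarget while reusing the shared pseudo definitions above.
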
+//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +multiclass VOP1_Real_si <bits<9> op> { + let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { + def _e32_si : + VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; + def _e64_si : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } +} + +defm V_NOP : VOP1_Real_si <0x0>; +defm V_MOV_B32 : VOP1_Real_si <0x1>; +defm V_CVT_I32_F64 : VOP1_Real_si <0x3>; +defm V_CVT_F64_I32 : VOP1_Real_si <0x4>; +defm V_CVT_F32_I32 : VOP1_Real_si <0x5>; +defm V_CVT_F32_U32 : VOP1_Real_si <0x6>; +defm V_CVT_U32_F32 : VOP1_Real_si <0x7>; +defm V_CVT_I32_F32 : VOP1_Real_si <0x8>; +defm V_MOV_FED_B32 : VOP1_Real_si <0x9>; +defm V_CVT_F16_F32 : VOP1_Real_si <0xa>; +defm V_CVT_F32_F16 : VOP1_Real_si <0xb>; +defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>; +defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>; +defm V_CVT_F32_F64 : VOP1_Real_si <0xf>; +defm V_CVT_F64_F32 : VOP1_Real_si <0x10>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>; +defm V_CVT_U32_F64 : VOP1_Real_si <0x15>; +defm V_CVT_F64_U32 : VOP1_Real_si <0x16>; +defm V_FRACT_F32 : VOP1_Real_si <0x20>; +defm V_TRUNC_F32 : VOP1_Real_si <0x21>; +defm V_CEIL_F32 : VOP1_Real_si <0x22>; +defm V_RNDNE_F32 : VOP1_Real_si <0x23>; +defm V_FLOOR_F32 : VOP1_Real_si <0x24>; +defm V_EXP_F32 : VOP1_Real_si <0x25>; +defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>; +defm V_LOG_F32 : VOP1_Real_si <0x27>; +defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>; +defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>; +defm V_RCP_F32 : VOP1_Real_si <0x2a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>; +defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>; +defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>; +defm V_RSQ_F32 : VOP1_Real_si <0x2e>; +defm V_RCP_F64 : VOP1_Real_si <0x2f>; +defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>; +defm V_RSQ_F64 : VOP1_Real_si <0x31>; +defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>; +defm V_SQRT_F32 : VOP1_Real_si <0x33>; +defm V_SQRT_F64 : VOP1_Real_si <0x34>; +defm V_SIN_F32 : VOP1_Real_si <0x35>; +defm V_COS_F32 : VOP1_Real_si <0x36>; +defm V_NOT_B32 : VOP1_Real_si <0x37>; +defm V_BFREV_B32 : VOP1_Real_si <0x38>; +defm V_FFBH_U32 : VOP1_Real_si <0x39>; +defm V_FFBL_B32 : VOP1_Real_si <0x3a>; +defm V_FFBH_I32 : VOP1_Real_si <0x3b>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>; +defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>; +defm V_FRACT_F64 : VOP1_Real_si <0x3e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>; +defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>; +defm V_CLREXCP : VOP1_Real_si <0x41>; +defm V_MOVRELD_B32 : VOP1_Real_si <0x42>; +defm V_MOVRELS_B32 : VOP1_Real_si <0x43>; +defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>; + +//===----------------------------------------------------------------------===// +// CI +//===----------------------------------------------------------------------===// + +multiclass VOP1_Real_ci <bits<9> op> { + let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in { + def _e32_ci : + VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; + def _e64_ci : + 
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } +} + +defm V_TRUNC_F64 : VOP1_Real_ci <0x17>; +defm V_CEIL_F64 : VOP1_Real_ci <0x18>; +defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>; +defm V_RNDNE_F64 : VOP1_Real_ci <0x19>; +defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>; +defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>; + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPP <ps.OpName, P> { + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + bits<8> vdst; + let Inst{8-0} = 0xfa; // dpp + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; //encoding +} + +multiclass VOP1_Real_vi <bits<10> op> { + let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + def _e32_vi : + VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + + def _sdwa_vi : + VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + // For now left dpp only for asm/dasm + // TODO: add corresponding pseudo + def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; +} + +defm V_NOP : VOP1_Real_vi <0x0>; +defm V_MOV_B32 : VOP1_Real_vi <0x1>; +defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>; +defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>; +defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>; +defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>; +defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>; +defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>; +defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>; +defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>; +defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>; +defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>; +defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>; +defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>; +defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>; +defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>; +defm V_FRACT_F32 : VOP1_Real_vi <0x1b>; +defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>; +defm V_CEIL_F32 : VOP1_Real_vi <0x1d>; +defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>; +defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>; +defm V_EXP_F32 : VOP1_Real_vi <0x20>; +defm V_LOG_F32 : VOP1_Real_vi <0x21>; +defm V_RCP_F32 : VOP1_Real_vi <0x22>; +defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>; +defm V_RSQ_F32 : VOP1_Real_vi <0x24>; +defm V_RCP_F64 : VOP1_Real_vi <0x25>; +defm V_RSQ_F64 : VOP1_Real_vi <0x26>; +defm V_SQRT_F32 : VOP1_Real_vi <0x27>; +defm V_SQRT_F64 : VOP1_Real_vi <0x28>; +defm V_SIN_F32 : VOP1_Real_vi <0x29>; +defm V_COS_F32 : VOP1_Real_vi <0x2a>; +defm V_NOT_B32 : VOP1_Real_vi <0x2b>; +defm V_BFREV_B32 : VOP1_Real_vi <0x2c>; +defm V_FFBH_U32 : VOP1_Real_vi <0x2d>; +defm V_FFBL_B32 : VOP1_Real_vi <0x2e>; +defm V_FFBH_I32 : VOP1_Real_vi <0x2f>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>; +defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>; 
+defm V_FRACT_F64 : VOP1_Real_vi <0x32>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>; +defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>; +defm V_CLREXCP : VOP1_Real_vi <0x35>; +defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>; +defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>; +defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>; +defm V_TRUNC_F64 : VOP1_Real_vi <0x17>; +defm V_CEIL_F64 : VOP1_Real_vi <0x18>; +defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>; +defm V_RNDNE_F64 : VOP1_Real_vi <0x19>; +defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>; +defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>; +defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>; +defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>; +defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>; +defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>; +defm V_RCP_F16 : VOP1_Real_vi <0x3d>; +defm V_SQRT_F16 : VOP1_Real_vi <0x3e>; +defm V_RSQ_F16 : VOP1_Real_vi <0x3f>; +defm V_LOG_F16 : VOP1_Real_vi <0x40>; +defm V_EXP_F16 : VOP1_Real_vi <0x41>; +defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>; +defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>; +defm V_FLOOR_F16 : VOP1_Real_vi <0x44>; +defm V_CEIL_F16 : VOP1_Real_vi <0x45>; +defm V_TRUNC_F16 : VOP1_Real_vi <0x46>; +defm V_RNDNE_F16 : VOP1_Real_vi <0x47>; +defm V_FRACT_F16 : VOP1_Real_vi <0x48>; +defm V_SIN_F16 : VOP1_Real_vi <0x49>; +defm V_COS_F16 : VOP1_Real_vi <0x4a>; + + +// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR +// indexing mode. vdst can't be treated as a def for codegen purposes, +// and an implicit use and def of the super register should be added. +def V_MOV_B32_indirect : VPseudoInstSI<(outs), + (ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>, + PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst, + getVOPSrc0ForVT<i32>.ret:$src0)> { + let VOP1 = 1; + let SubtargetPredicate = isVI; +} + +// This is a pseudo variant of the v_movreld_b32 instruction in which the +// vector operand appears only twice, once as def and once as use. Using this +// pseudo avoids problems with the Two Address instructions pass. 
+class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI < + (outs rc:$vdst), + (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> { + let VOP1 = 1; + + let Constraints = "$vsrc = $vdst"; + let Uses = [M0, EXEC]; + + let SubtargetPredicate = HasMovrel; +} + +def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>; +def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>; +def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>; +def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; +def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; + +let Predicates = [isVI] in { + +def : Pat < + (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl)), + (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), + (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) +>; + + +def : Pat< + (i32 (anyext i16:$src)), + (COPY $src) +>; + +def : Pat< + (i64 (anyext i16:$src)), + (REG_SEQUENCE VReg_64, + (i32 (COPY $src)), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : Pat< + (i16 (trunc i32:$src)), + (COPY $src) +>; + +def : Pat < + (i16 (trunc i64:$src)), + (EXTRACT_SUBREG $src, sub0) +>; + +} // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td new file mode 100644 index 0000000..00e5ab3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -0,0 +1,757 @@ +//===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP2 Classes +//===----------------------------------------------------------------------===// + +class VOP2e <bits<6> op, VOPProfile P> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = !if(P.HasSrc0, src0, 0); + let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{24-17} = !if(P.EmitDst, vdst, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + bits<32> imm; + + let Inst{8-0} = !if(P.HasSrc0, src0, 0); + let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{24-17} = !if(P.EmitDst, vdst, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = imm; +} + +class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding +} + +class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : + InstSI <P.Outs32, P.Ins32, "", pattern>, + VOP <opName>, + SIMCInstr <opName#suffix, SIEncodingFamily.NONE>, + MnemonicAlias<opName#suffix, opName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.Asm32; + + let Size = 4; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SubtargetPredicate = isGCN; + + let VOP2 = 1; + let VALU = 1; + let Uses = [EXEC]; + + let AsmVariantName = AMDGPUAsmVariants.Default; + + VOPProfile Pfl = P; +} + +class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> 
: + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, EncodingFamily> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOP2"; +} + +class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { + list<dag> ret = !if(P.HasModifiers, + [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); +} + +multiclass VOP2Inst <string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> { + + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P>, + Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; +} + +// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst +multiclass VOP2bInst <string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { + + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P>, + Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; + } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + } +} + +multiclass VOP2eInst <string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { + + let SchedRW = [Write32Bit] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + } +} + +class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { + field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); + field string Asm32 = "$vdst, $src0, $src1, $imm"; + field bit HasExt = 0; +} + +def VOP_MADAK_F16 : VOP_MADAK <f16>; +def VOP_MADAK_F32 : VOP_MADAK <f32>; + +class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { + field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); + field string Asm32 = "$vdst, $src0, $imm, $src1"; + field bit HasExt = 0; +} + +def VOP_MADMK_F16 : VOP_MADMK <f16>; +def VOP_MADMK_F32 : VOP_MADMK <f32>; + +class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); + let Ins64 = getIns64<Src0RC64, 
Src1RC64, RegisterOperand<VGPR_32>, 3, + HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let Asm32 = getAsm32<1, 2, vt>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; + let HasSrc2 = 0; + let HasSrc2Mods = 0; + let HasExt = 1; +} + +def VOP_MAC_F16 : VOP_MAC <f16> { + // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives + // 'not a string initializer' error. + let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret; +} + +def VOP_MAC_F32 : VOP_MAC <f32> { + // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives + // 'not a string initializer' error. + let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; +} + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$vdst, vcc, $src0, $src1"; + let Asm64 = "$vdst, $sdst, $src0, $src1"; + let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_b32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. + let Src0RC32 = VCSrc_b32; + let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; + let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); + + let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, + Src1Mod:$src1_modifiers, Src1SDWA:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + + let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, + Src1Mod:$src1_modifiers, Src1DPP:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let HasExt = 1; +} + +// Read in from vcc or arbitrary SGPR +def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. 
+ let Asm32 = "$vdst, $src0, $src1, vcc"; + let Asm64 = "$vdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); +} + +def VOP_READLANE : VOPProfile<[i32, i32, i32]> { + let Outs32 = (outs SReg_32:$vdst); + let Outs64 = Outs32; + let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1); + let Ins64 = Ins32; + let Asm32 = " $vdst, $src0, $src1"; + let Asm64 = Asm32; +} + +def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { + let Outs32 = (outs VGPR_32:$vdst); + let Outs64 = Outs32; + let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1); + let Ins64 = Ins32; + let Asm32 = " $vdst, $src0, $src1"; + let Asm64 = Asm32; +} + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGCN in { + +defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; +defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; +defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>; +defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>; +defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>; +defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>; +defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>; +defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; +defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>; +defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>; +defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>; +defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">; +defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">; +defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">; +defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>; + +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; +} + +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; + +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. 
+defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>; +defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>; +defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">; +defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>; +defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>; +defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">; +} // End isCommutable = 1 + +// These are special and do not read the exec mask. +let isConvergent = 1, Uses = []<Register> in { +def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, + [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">; + +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; +} // End isConvergent = 1 + +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; + +} // End SubtargetPredicate = isGCN + + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>; +defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 + +} // End let SubtargetPredicate = SICI + +let SubtargetPredicate = isVI in { + +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; +defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; +defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; +defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; + +let isCommutable = 1 in { +defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; +defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; +defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; +defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; +defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; +defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; +defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; +defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; +defm V_MIN_F16 : VOP2Inst 
<"v_min_f16", VOP_F16_F16_F16, fminnum>; +defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; +defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; +defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; +defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; + +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +} +} // End isCommutable = 1 + +} // End SubtargetPredicate = isVI + +// Note: 16-bit instructions produce a 0 result in the high 16-bits. +multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { + +def : Pat< + (op i16:$src0, i16:$src1), + (inst $src0, $src1) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i16:$src1))), + (inst $src0, $src1) +>; + +def : Pat< + (i64 (zext (op i16:$src0, i16:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src0, $src1), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +} + +multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> { + +def : Pat< + (op i16:$src0, i16:$src1), + (inst $src1, $src0) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i16:$src1))), + (inst $src1, $src0) +>; + + +def : Pat< + (i64 (zext (op i16:$src0, i16:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src1, $src0), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +class ZExt_i16_i1_Pat <SDNode ext> : Pat < + (i16 (ext i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) +>; + +let Predicates = [isVI] in { + +defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>; +defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>; +defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>; +defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>; +defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>; +defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>; +defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>; + +def : Pat < + (and i16:$src0, i16:$src1), + (V_AND_B32_e64 $src0, $src1) +>; + +def : Pat < + (or i16:$src0, i16:$src1), + (V_OR_B32_e64 $src0, $src1) +>; + +def : Pat < + (xor i16:$src0, i16:$src1), + (V_XOR_B32_e64 $src0, $src1) +>; + +defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>; +defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>; +defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>; + +def : ZExt_i16_i1_Pat<zext>; +def : ZExt_i16_i1_Pat<anyext>; + +def : Pat < + (i16 (sext i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) +>; + +} // End Predicates = [isVI] + +//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { + +multiclass VOP2_Real_si <bits<6> op> { + def _si : + VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +} + +multiclass VOP2_Real_MADK_si <bits<6> op> { + def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +} + +multiclass VOP2_Real_e32_si <bits<6> op> { + def _e32_si : + VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; +} + +multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { + def _e64_si : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { + def _e64_si : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), 
SIEncodingFamily.SI>, + VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" + +defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>; +defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>; +defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>; +defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>; +defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>; +defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>; +defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>; +defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>; +defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>; +defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>; +defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>; +defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>; +defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>; +defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>; +defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>; +defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>; +defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>; +defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>; +defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>; +defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>; +defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>; +defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>; +defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>; +defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>; +defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>; +defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>; +defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>; +defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>; +defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>; +defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; +defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; + +defm V_READLANE_B32 : VOP2_Real_si <0x01>; +defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; + +defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; +defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; +defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>; +defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>; +defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>; +defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>; + +defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>; +defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>; +defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; + + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPP <ps.OpName, P> { + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + bits<8> vdst; + bits<8> src1; + let Inst{8-0} = 0xfa; //dpp + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + +multiclass VOP32_Real_vi <bits<10> op> { + def _vi : + VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>; +} + 
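The VOP2_DPP class just above pins down the first dword of a DPP-modified VOP2: the nine-bit src0 field carries the 0xfa DPP marker, vsrc1 and vdst follow, the six-bit opcode sits in bits 30-25, and bit 31 stays clear (the DPP controls themselves travel in the extra dword supplied by VOP_DPP, which is defined elsewhere). A small C++ sketch of that packing; the function name is illustrative.

#include <cstdint>
#include <cstdio>

// Pack the leading dword of a VOP2 DPP instruction following class VOP2_DPP:
//   Inst{8-0}   = 0xfa   (DPP marker in the src0 slot)
//   Inst{16-9}  = vsrc1  (VGPR number)
//   Inst{24-17} = vdst   (VGPR number)
//   Inst{30-25} = op     (6-bit VOP2 opcode)
//   Inst{31}    = 0      (VOP2 encoding)
static uint32_t packVOP2DPPWord0(unsigned Op, unsigned Vdst, unsigned Vsrc1) {
  uint32_t Inst = 0xfau;                 // bits 8-0
  Inst |= uint32_t(Vsrc1 & 0xffu) << 9;  // bits 16-9
  Inst |= uint32_t(Vdst & 0xffu) << 17;  // bits 24-17
  Inst |= uint32_t(Op & 0x3fu) << 25;    // bits 30-25, bit 31 stays 0
  return Inst;
}

int main() {
  // For example v_add_f32_dpp v2, v0, v1 with the VI opcode 0x1 from the table below.
  std::printf("0x%08x\n", (unsigned)packVOP2DPPWord0(/*Op=*/0x1, /*Vdst=*/2, /*Vsrc1=*/1));
  return 0;
}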
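The 16-bit patterns earlier in this file (Arithmetic_i16_Pats and Bits_OpsRev_i16_Pats) lean on the note that VI's 16-bit VALU ops leave zero in the high 16 bits of the destination VGPR: a zext to i32 is then the instruction itself, a zext to i64 only needs a zeroed second register (the REG_SEQUENCE with V_MOV_B32 0), and the *rev shift patterns swap the operands because those forms take the shift amount first. A register-level C++ sketch, with illustrative names.

#include <cstdint>
#include <cstdio>

// Register-level view of a VI 16-bit VALU op: the result occupies the low
// 16 bits of the 32-bit VGPR and the high 16 bits read as zero, per the note
// above the i16 patterns.  vop16 is an illustrative helper.
static uint32_t vop16(uint16_t Result) {
  return uint32_t(Result); // high half is zero by construction
}

int main() {
  uint16_t A = 0xfff0, B = 0x0025;

  // v_add_u16: the VGPR already holds zext(i16 (A + B)) to i32.
  uint32_t Vgpr = vop16(uint16_t(A + B));
  std::printf("zext to i32: 0x%08x\n", Vgpr);

  // zext to i64 just pairs it with a zeroed high register, which is what the
  // REG_SEQUENCE with V_MOV_B32 0 in the patterns builds.
  uint64_t Pair = uint64_t(Vgpr); // high dword is 0
  std::printf("zext to i64: 0x%016llx\n", (unsigned long long)Pair);

  // v_lshlrev_b16 takes the shift amount first, so shl(A, 3) is emitted as
  // lshlrev(3, A); Bits_OpsRev_i16_Pats does exactly that operand swap.
  std::printf("shl by 3:    0x%08x\n", vop16(uint16_t(A << 3)));
  return 0;
}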
+multiclass VOP2_Real_MADK_vi <bits<6> op> { + def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +} + +multiclass VOP2_Real_e32_vi <bits<6> op> { + def _e32_vi : + VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; +} + +multiclass VOP2_Real_e64_vi <bits<10> op> { + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> { + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : + VOP2_Real_e32_vi<op>, + VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; + +} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" + +multiclass VOP2_SDWA_Real <bits<6> op> { + def _sdwa_vi : + VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; +} + +multiclass VOP2be_Real_e32e64_vi <bits<6> op> : + Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + // For now left dpp only for asm/dasm + // TODO: add corresponding pseudo + def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; +} + +multiclass VOP2_Real_e32e64_vi <bits<6> op> : + Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + // For now left dpp only for asm/dasm + // TODO: add corresponding pseudo + def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; +} + +defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>; +defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; +defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; +defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; +defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>; +defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>; +defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>; +defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_vi <0x7>; +defm V_MUL_U32_U24 : VOP2_Real_e32e64_vi <0x8>; +defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_vi <0x9>; +defm V_MIN_F32 : VOP2_Real_e32e64_vi <0xa>; +defm V_MAX_F32 : VOP2_Real_e32e64_vi <0xb>; +defm V_MIN_I32 : VOP2_Real_e32e64_vi <0xc>; +defm V_MAX_I32 : VOP2_Real_e32e64_vi <0xd>; +defm V_MIN_U32 : VOP2_Real_e32e64_vi <0xe>; +defm V_MAX_U32 : VOP2_Real_e32e64_vi <0xf>; +defm V_LSHRREV_B32 : VOP2_Real_e32e64_vi <0x10>; +defm V_ASHRREV_I32 : VOP2_Real_e32e64_vi <0x11>; +defm V_LSHLREV_B32 : VOP2_Real_e32e64_vi <0x12>; +defm V_AND_B32 : VOP2_Real_e32e64_vi <0x13>; +defm V_OR_B32 : VOP2_Real_e32e64_vi <0x14>; +defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>; +defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>; +defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>; +defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>; +defm V_ADD_I32 : VOP2be_Real_e32e64_vi <0x19>; +defm V_SUB_I32 : VOP2be_Real_e32e64_vi <0x1a>; +defm V_SUBREV_I32 : VOP2be_Real_e32e64_vi <0x1b>; +defm V_ADDC_U32 : VOP2be_Real_e32e64_vi <0x1c>; +defm V_SUBB_U32 : VOP2be_Real_e32e64_vi <0x1d>; +defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>; + +defm V_READLANE_B32 : VOP32_Real_vi <0x289>; +defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; + +defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>; +defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>; +defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>; +defm V_CVT_PKNORM_I16_F32 
: VOP2_Real_e64_vi <0x294>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>; + +defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>; +defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>; +defm V_SUBREV_F16 : VOP2_Real_e32e64_vi <0x21>; +defm V_MUL_F16 : VOP2_Real_e32e64_vi <0x22>; +defm V_MAC_F16 : VOP2_Real_e32e64_vi <0x23>; +defm V_MADMK_F16 : VOP2_Real_MADK_vi <0x24>; +defm V_MADAK_F16 : VOP2_Real_MADK_vi <0x25>; +defm V_ADD_U16 : VOP2_Real_e32e64_vi <0x26>; +defm V_SUB_U16 : VOP2_Real_e32e64_vi <0x27>; +defm V_SUBREV_U16 : VOP2_Real_e32e64_vi <0x28>; +defm V_MUL_LO_U16 : VOP2_Real_e32e64_vi <0x29>; +defm V_LSHLREV_B16 : VOP2_Real_e32e64_vi <0x2a>; +defm V_LSHRREV_B16 : VOP2_Real_e32e64_vi <0x2b>; +defm V_ASHRREV_I16 : VOP2_Real_e32e64_vi <0x2c>; +defm V_MAX_F16 : VOP2_Real_e32e64_vi <0x2d>; +defm V_MIN_F16 : VOP2_Real_e32e64_vi <0x2e>; +defm V_MAX_U16 : VOP2_Real_e32e64_vi <0x2f>; +defm V_MAX_I16 : VOP2_Real_e32e64_vi <0x30>; +defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>; +defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>; +defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>; + +let SubtargetPredicate = isVI in { + +// Aliases to simplify matching of floating-point instructions that +// are VOP2 on SI and VOP3 on VI. +class SI2_VI3Alias <string name, Instruction inst> : InstAlias < + name#" $dst, $src0, $src1", + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) +>, PredicateControl { + let UseInstAsmMatchConverter = 0; + let AsmVariantName = AMDGPUAsmVariants.VOP3; +} + +def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; + +} // End SubtargetPredicate = isVI diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td new file mode 100644 index 0000000..c2a4d4b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -0,0 +1,451 @@ +//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP3 Classes +//===----------------------------------------------------------------------===// + +class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3Pat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; + list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; + list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]; + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> : + VOP3_Pseudo<OpName, P, + !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret), + VOP3Only>; + +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. +let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { +def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { + let Outs64 = (outs DstRC.RegClass:$vdst); +} +def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { + let Outs64 = (outs DstRC.RegClass:$vdst); +} +} + +class getVOP3VCC<VOPProfile P, SDPatternOperator node> { + list<dag> ret = + [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))]; +} + +class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> { + // FIXME: Hack to stop printing _e64 + let Outs64 = (outs DstRC.RegClass:$vdst); + let Asm64 = " " # P.Asm64; +} + +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + // v_div_scale_{f32|f64} do not support input modifiers. 
+ let HasModifiers = 0; + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; +} + +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { + +def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; +def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>; +def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>; +def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>; +def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; +def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; +def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; + +let SchedRW = [WriteDoubleAdd] in { +def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; +def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; +def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>; +def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { +def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>; +def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>; +def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>; +def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>; +} // End SchedRW = [WriteQuarterRate32] + +let Uses = [VCC, EXEC] in { +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, + getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> { + let SchedRW = [WriteFloatFMA]; +} +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, + getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> { + let SchedRW = [WriteDouble]; +} +} // End Uses = [VCC, EXEC] + +} // End isCommutable = 1 + +def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>; +def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>; +def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>; +def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>; +def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>; +def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>; +def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>; +def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_MIN3_F32 : VOP3Inst 
<"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>; +def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>; +def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>; +def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>; +def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>; +def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>; +def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; +def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>; +def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>; +def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>; +def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>; +def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>; +def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; +def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>; + +let SchedRW = [WriteDoubleAdd] in { +def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>; +def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>; +} // End SchedRW = [WriteDoubleAdd] + +def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { + let SchedRW = [WriteFloatFMA, WriteSALU]; + let hasExtraSrcRegAllocReq = 1; + let AsmMatchConverter = ""; +} + +// Double precision division pre-scale. 
+def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { + let SchedRW = [WriteDouble, WriteSALU]; + let hasExtraSrcRegAllocReq = 1; + let AsmMatchConverter = ""; +} + +def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>; +def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>; + +def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> { + let SchedRW = [WriteDouble]; +} + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { +def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>; +def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>; +def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>; +def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { +def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; +def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; +def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; +} // End SubtargetPredicate = isVI + + +let SubtargetPredicate = isCIVI in { + +def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>; +def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>; +def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>; + +let isCommutable = 1 in { +def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>; + +// XXX - Does this set VCC? 
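v_mad_u64_u32 and v_mad_i64_i32 (the latter defined just below, with the VCC question left open in the source) use VOP3_Profile<VOP_I64_I32_I32_I64>: a 64-bit destination, two 32-bit multiplicands and a 64-bit addend. Assuming the obvious reading of the mnemonics, which this file does not spell out, a per-lane sketch that ignores any carry output:

#include <cstdint>
#include <cstdio>

// Assumed per-lane semantics, matching the VOP_I64_I32_I32_I64 profile:
//   dst(64) = src0(32) * src1(32) + src2(64)
// Any carry/overflow side output is deliberately not modelled.
static uint64_t madU64U32(uint32_t Src0, uint32_t Src1, uint64_t Src2) {
  return uint64_t(Src0) * uint64_t(Src1) + Src2;
}

static int64_t madI64I32(int32_t Src0, int32_t Src1, int64_t Src2) {
  return int64_t(Src0) * int64_t(Src1) + Src2;
}

int main() {
  std::printf("u: %llu\n", (unsigned long long)madU64U32(0xffffffffu, 2u, 5u));
  std::printf("s: %lld\n", (long long)madI64I32(-3, 7, 100));
  return 0;
}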
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>; +} // End isCommutable = 1 + +} // End SubtargetPredicate = isCIVI + + +let SubtargetPredicate = isVI in { + +let isCommutable = 1 in { + +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; +def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; +def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; +def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>; +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; + +def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; +def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; + +} // End isCommutable = 1 + +} // End SubtargetPredicate = isVI + +let Predicates = [isVI] in { + +multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, + Instruction inst, SDPatternOperator op3> { +def : Pat< + (op2 (op1 i16:$src0, i16:$src1), i16:$src2), + (inst i16:$src0, i16:$src1, i16:$src2) +>; + +def : Pat< + (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (inst i16:$src0, i16:$src1, i16:$src2) +>; + +def : Pat< + (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (REG_SEQUENCE VReg_64, + (inst i16:$src0, i16:$src1, i16:$src2), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>; +defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; + +} // End Predicates = [isVI] + + +//===----------------------------------------------------------------------===// +// Target +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { + +multiclass VOP3_Real_si<bits<9> op> { + def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>; +} + +multiclass VOP3be_Real_si<bits<9> op> { + def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>; +} + +} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" + +defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>; +defm V_MAD_F32 : VOP3_Real_si <0x141>; +defm V_MAD_I32_I24 : VOP3_Real_si <0x142>; +defm V_MAD_U32_U24 : VOP3_Real_si <0x143>; +defm V_CUBEID_F32 : VOP3_Real_si <0x144>; +defm V_CUBESC_F32 : VOP3_Real_si <0x145>; +defm V_CUBETC_F32 : VOP3_Real_si <0x146>; +defm V_CUBEMA_F32 : VOP3_Real_si <0x147>; +defm V_BFE_U32 : VOP3_Real_si <0x148>; +defm V_BFE_I32 : VOP3_Real_si <0x149>; +defm V_BFI_B32 : VOP3_Real_si <0x14a>; +defm V_FMA_F32 : VOP3_Real_si <0x14b>; +defm V_FMA_F64 : VOP3_Real_si <0x14c>; +defm V_LERP_U8 : VOP3_Real_si <0x14d>; +defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>; +defm V_MULLIT_F32 : VOP3_Real_si <0x150>; +defm V_MIN3_F32 : VOP3_Real_si <0x151>; +defm V_MIN3_I32 : VOP3_Real_si <0x152>; +defm V_MIN3_U32 : VOP3_Real_si <0x153>; +defm V_MAX3_F32 : VOP3_Real_si <0x154>; +defm V_MAX3_I32 : VOP3_Real_si <0x155>; +defm V_MAX3_U32 : VOP3_Real_si <0x156>; +defm V_MED3_F32 : VOP3_Real_si <0x157>; +defm V_MED3_I32 : VOP3_Real_si <0x158>; +defm 
V_MED3_U32 : VOP3_Real_si <0x159>; +defm V_SAD_U8 : VOP3_Real_si <0x15a>; +defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>; +defm V_SAD_U16 : VOP3_Real_si <0x15c>; +defm V_SAD_U32 : VOP3_Real_si <0x15d>; +defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>; +defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>; +defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>; +defm V_LSHL_B64 : VOP3_Real_si <0x161>; +defm V_LSHR_B64 : VOP3_Real_si <0x162>; +defm V_ASHR_I64 : VOP3_Real_si <0x163>; +defm V_ADD_F64 : VOP3_Real_si <0x164>; +defm V_MUL_F64 : VOP3_Real_si <0x165>; +defm V_MIN_F64 : VOP3_Real_si <0x166>; +defm V_MAX_F64 : VOP3_Real_si <0x167>; +defm V_LDEXP_F64 : VOP3_Real_si <0x168>; +defm V_MUL_LO_U32 : VOP3_Real_si <0x169>; +defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>; +defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>; +defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>; +defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>; +defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>; +defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>; +defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>; +defm V_MSAD_U8 : VOP3_Real_si <0x171>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>; +defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>; + +//===----------------------------------------------------------------------===// +// CI +//===----------------------------------------------------------------------===// + +multiclass VOP3_Real_ci<bits<9> op> { + def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> { + let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; + } +} + +defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; +defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>; +defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>; +defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>; + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + +multiclass VOP3_Real_vi<bits<10> op> { + def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>; +} + +multiclass VOP3be_Real_vi<bits<10> op> { + def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>; +} + +} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" + +defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; +defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>; +defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>; + +defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>; +defm V_MAD_F32 : VOP3_Real_vi <0x1c1>; +defm V_MAD_I32_I24 : VOP3_Real_vi <0x1c2>; +defm V_MAD_U32_U24 : VOP3_Real_vi <0x1c3>; +defm V_CUBEID_F32 : VOP3_Real_vi <0x1c4>; +defm V_CUBESC_F32 : VOP3_Real_vi <0x1c5>; +defm V_CUBETC_F32 : VOP3_Real_vi <0x1c6>; +defm V_CUBEMA_F32 : VOP3_Real_vi <0x1c7>; +defm V_BFE_U32 : VOP3_Real_vi <0x1c8>; +defm V_BFE_I32 : VOP3_Real_vi <0x1c9>; +defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; +defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; +defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; +defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; +defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; +defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; +defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; +defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; +defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; +defm V_MAX3_F32 : VOP3_Real_vi <0x1d3>; +defm V_MAX3_I32 : VOP3_Real_vi <0x1d4>; +defm V_MAX3_U32 : VOP3_Real_vi <0x1d5>; +defm V_MED3_F32 : VOP3_Real_vi <0x1d6>; +defm V_MED3_I32 : VOP3_Real_vi 
<0x1d7>; +defm V_MED3_U32 : VOP3_Real_vi <0x1d8>; +defm V_SAD_U8 : VOP3_Real_vi <0x1d9>; +defm V_SAD_HI_U8 : VOP3_Real_vi <0x1da>; +defm V_SAD_U16 : VOP3_Real_vi <0x1db>; +defm V_SAD_U32 : VOP3_Real_vi <0x1dc>; +defm V_CVT_PK_U8_F32 : VOP3_Real_vi <0x1dd>; +defm V_DIV_FIXUP_F32 : VOP3_Real_vi <0x1de>; +defm V_DIV_FIXUP_F64 : VOP3_Real_vi <0x1df>; +defm V_DIV_SCALE_F32 : VOP3be_Real_vi <0x1e0>; +defm V_DIV_SCALE_F64 : VOP3be_Real_vi <0x1e1>; +defm V_DIV_FMAS_F32 : VOP3_Real_vi <0x1e2>; +defm V_DIV_FMAS_F64 : VOP3_Real_vi <0x1e3>; +defm V_MSAD_U8 : VOP3_Real_vi <0x1e4>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>; +defm V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>; + +defm V_MAD_F16 : VOP3_Real_vi <0x1ea>; +defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; +defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; + +defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; +defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>; + +defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>; +defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>; +defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>; +defm V_ADD_F64 : VOP3_Real_vi <0x280>; +defm V_MUL_F64 : VOP3_Real_vi <0x281>; +defm V_MIN_F64 : VOP3_Real_vi <0x282>; +defm V_MAX_F64 : VOP3_Real_vi <0x283>; +defm V_LDEXP_F64 : VOP3_Real_vi <0x284>; +defm V_MUL_LO_U32 : VOP3_Real_vi <0x285>; + +// removed from VI as identical to V_MUL_LO_U32 +let isAsmParserOnly = 1 in { +defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>; +} + +defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>; +defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>; + +defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>; +defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; +defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; +defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td new file mode 100644 index 0000000..16a456d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -0,0 +1,1144 @@ +//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Encodings +//===----------------------------------------------------------------------===// + +class VOPCe <bits<8> op> : Enc32 { + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = src0; + let Inst{16-9} = src1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + + // VOPC disallows dst_sel and dst_unused as they have no effect on destination + let Inst{42-40} = SDWA.DWORD; + let Inst{44-43} = SDWA.UNUSED_PRESERVE; +} + +//===----------------------------------------------------------------------===// +// VOPC classes +//===----------------------------------------------------------------------===// + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. 
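The profiles and VOPC_Pseudos multiclass that follow make the write targets explicit: every lane evaluates the comparison and contributes one bit of a wave-wide mask, a plain v_cmp_* leaves that mask in VCC (or in an arbitrary SGPR pair for the e64 form, via $sdst), and the v_cmpx_* variants, built with DefExec = 1, also overwrite EXEC with it. A wave-level C++ sketch under the assumptions of a 64-lane wave and inactive lanes contributing zero bits; the names are illustrative.

#include <cstdint>
#include <cstdio>

// Wave-level sketch of a VOPC compare: each active lane contributes one bit
// to a 64-bit mask.  v_cmp_* writes the mask to VCC; the v_cmpx_* variants
// (Defs = [VCC, EXEC] below) also replace EXEC with it.
constexpr unsigned WaveSize = 64;

struct WaveState {
  uint64_t Exec;
  uint64_t Vcc;
};

template <typename T, typename Cmp>
static void vCmp(WaveState &W, const T (&Src0)[WaveSize],
                 const T (&Src1)[WaveSize], Cmp C, bool WritesExec) {
  uint64_t Mask = 0;
  for (unsigned Lane = 0; Lane != WaveSize; ++Lane)
    if ((W.Exec >> Lane) & 1)               // inactive lanes contribute 0
      if (C(Src0[Lane], Src1[Lane]))
        Mask |= uint64_t(1) << Lane;
  W.Vcc = Mask;
  if (WritesExec)                           // the v_cmpx_* flavour
    W.Exec = Mask;
}

int main() {
  WaveState W = {~uint64_t(0), 0};
  float A[WaveSize], B[WaveSize];
  for (unsigned I = 0; I != WaveSize; ++I) {
    A[I] = float(I);
    B[I] = 32.0f;
  }

  // v_cmp_lt_f32: VCC gets one bit per lane with A < B, EXEC is untouched.
  vCmp(W, A, B, [](float X, float Y) { return X < Y; }, /*WritesExec=*/false);
  std::printf("vcc=0x%016llx exec=0x%016llx\n",
              (unsigned long long)W.Vcc, (unsigned long long)W.Exec);

  // v_cmpx_lt_f32: same compare, but EXEC is replaced by the result mask too.
  vCmp(W, A, B, [](float X, float Y) { return X < Y; }, /*WritesExec=*/true);
  std::printf("vcc=0x%016llx exec=0x%016llx\n",
              (unsigned long long)W.Vcc, (unsigned long long)W.Exec);
  return 0;
}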
+class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> : + VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. + let HasDst32 = 0; + let Outs64 = (outs VOPDstS64:$sdst); + list<SchedReadWrite> Schedule = sched; +} + +class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> : + InstSI<(outs), P.Ins32, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.Asm32; + + let Size = 4; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let VOPC = 1; + let Uses = [EXEC]; + let Defs = [VCC]; + + let SubtargetPredicate = isGCN; + + VOPProfile Pfl = P; +} + +class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, EncodingFamily> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOPC"; +} + +// This class is used only with VOPC instructions. Use $sdst for out operand +class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : + InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$sdst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$sdst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); + + let AsmVariantName = AMDGPUAsmVariants.Default; + let SubtargetPredicate = AssemblerPredicate; +} + +multiclass VOPC_Pseudos <string opName, + VOPC_Profile P, + PatLeaf cond = COND_NULL, + string revOp = opName, + bit DefExec = 0> { + + def _e32 : VOPC_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + let isCommutable = 1; + } + + def _e64 : VOP3_Pseudo<opName, P, + !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = P.Schedule; + let isCompare = 1; + let isCommutable = 1; + } + + def _sdwa : VOPC_SDWA_Pseudo <opName, P>, + Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + 
let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + let isCommutable = 1; + } +} + +def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; +def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>; +def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>; +def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>; + +multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>; + +multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>; + +multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; + +multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>; + +multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; + +multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; + +multiclass VOPCX_F16 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>; + +multiclass VOPCX_F32 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>; + +multiclass VOPCX_F64 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>; + +multiclass VOPCX_I16 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>; + +multiclass VOPCX_I32 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>; + +multiclass VOPCX_I64 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>; + + +//===----------------------------------------------------------------------===// +// Compare instructions +//===----------------------------------------------------------------------===// + +defm V_CMP_F_F32 : VOPC_F32 <"v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 <"v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; +defm V_CMP_EQ_F32 : VOPC_F32 <"v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 <"v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; +defm V_CMP_GT_F32 : VOPC_F32 <"v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 <"v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 <"v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 <"v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 <"v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 <"v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 <"v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 <"v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 <"v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 <"v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 <"v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 <"v_cmp_tru_f32">; + +defm V_CMPX_F_F32 : VOPCX_F32 <"v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <"v_cmpx_lt_f32", "v_cmpx_gt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 <"v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <"v_cmpx_le_f32", "v_cmpx_ge_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 
<"v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 <"v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 <"v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 <"v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 <"v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 <"v_cmpx_nge_f32", "v_cmpx_nle_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 <"v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 <"v_cmpx_ngt_f32", "v_cmpx_nlt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 <"v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 <"v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 <"v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 <"v_cmpx_tru_f32">; + +defm V_CMP_F_F64 : VOPC_F64 <"v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 <"v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; +defm V_CMP_EQ_F64 : VOPC_F64 <"v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 <"v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; +defm V_CMP_GT_F64 : VOPC_F64 <"v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 <"v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 <"v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 <"v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 <"v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 <"v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 <"v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 <"v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 <"v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 <"v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 <"v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 <"v_cmp_tru_f64">; + +defm V_CMPX_F_F64 : VOPCX_F64 <"v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <"v_cmpx_lt_f64", "v_cmpx_gt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 <"v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <"v_cmpx_le_f64", "v_cmpx_ge_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 <"v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 <"v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 <"v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 <"v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 <"v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <"v_cmpx_nge_f64", "v_cmpx_nle_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 <"v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 <"v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 <"v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">; + +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 <"v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <"v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 <"v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 <"v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 <"v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 <"v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 <"v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <"v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 <"v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <"v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 <"v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 <"v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 <"v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 <"v_cmps_tru_f32">; + +defm V_CMPSX_F_F32 : VOPCX_F32 <"v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 
<"v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 <"v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <"v_cmpsx_le_f32", "v_cmpsx_ge_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 <"v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 <"v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 <"v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 <"v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 <"v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <"v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 <"v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <"v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 <"v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 <"v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 <"v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 <"v_cmpsx_tru_f32">; + +defm V_CMPS_F_F64 : VOPC_F64 <"v_cmps_f_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <"v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 <"v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <"v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 <"v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 <"v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 <"v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 <"v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 <"v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <"v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 <"v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <"v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 <"v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 <"v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 <"v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 <"v_cmps_tru_f64">; + +defm V_CMPSX_F_F64 : VOPCX_F64 <"v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 <"v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 <"v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 <"v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 <"v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 <"v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 <"v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 <"v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 <"v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 <"v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 <"v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 <"v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 <"v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">; + +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = Has16BitInsts in { + +defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">; +defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">; +defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>; +defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">; +defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>; +defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>; +defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>; +defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>; +defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>; +defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">; +defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>; +defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">; +defm 
V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>; +defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>; +defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>; +defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">; + +defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">; +defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">; +defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">; +defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">; +defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16", "v_cmpx_nle_f16">; +defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16", "v_cmpx_nlt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">; +defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">; + +defm V_CMP_F_I16 : VOPC_I16 <"v_cmp_f_i16">; +defm V_CMP_LT_I16 : VOPC_I16 <"v_cmp_lt_i16", COND_SLT, "v_cmp_gt_i16">; +defm V_CMP_EQ_I16 : VOPC_I16 <"v_cmp_eq_i16">; +defm V_CMP_LE_I16 : VOPC_I16 <"v_cmp_le_i16", COND_SLE, "v_cmp_ge_i16">; +defm V_CMP_GT_I16 : VOPC_I16 <"v_cmp_gt_i16", COND_SGT>; +defm V_CMP_NE_I16 : VOPC_I16 <"v_cmp_ne_i16">; +defm V_CMP_GE_I16 : VOPC_I16 <"v_cmp_ge_i16", COND_SGE>; +defm V_CMP_T_I16 : VOPC_I16 <"v_cmp_t_i16">; + +defm V_CMP_F_U16 : VOPC_I16 <"v_cmp_f_u16">; +defm V_CMP_LT_U16 : VOPC_I16 <"v_cmp_lt_u16", COND_ULT, "v_cmp_gt_u16">; +defm V_CMP_EQ_U16 : VOPC_I16 <"v_cmp_eq_u16", COND_EQ>; +defm V_CMP_LE_U16 : VOPC_I16 <"v_cmp_le_u16", COND_ULE, "v_cmp_ge_u16">; +defm V_CMP_GT_U16 : VOPC_I16 <"v_cmp_gt_u16", COND_UGT>; +defm V_CMP_NE_U16 : VOPC_I16 <"v_cmp_ne_u16", COND_NE>; +defm V_CMP_GE_U16 : VOPC_I16 <"v_cmp_ge_u16", COND_UGE>; +defm V_CMP_T_U16 : VOPC_I16 <"v_cmp_t_u16">; + +defm V_CMPX_F_I16 : VOPCX_I16 <"v_cmpx_f_i16">; +defm V_CMPX_LT_I16 : VOPCX_I16 <"v_cmpx_lt_i16", "v_cmpx_gt_i16">; +defm V_CMPX_EQ_I16 : VOPCX_I16 <"v_cmpx_eq_i16">; +defm V_CMPX_LE_I16 : VOPCX_I16 <"v_cmpx_le_i16", "v_cmpx_ge_i16">; +defm V_CMPX_GT_I16 : VOPCX_I16 <"v_cmpx_gt_i16">; +defm V_CMPX_NE_I16 : VOPCX_I16 <"v_cmpx_ne_i16">; +defm V_CMPX_GE_I16 : VOPCX_I16 <"v_cmpx_ge_i16">; +defm V_CMPX_T_I16 : VOPCX_I16 <"v_cmpx_t_i16">; +defm V_CMPX_F_U16 : VOPCX_I16 <"v_cmpx_f_u16">; + +defm V_CMPX_LT_U16 : VOPCX_I16 <"v_cmpx_lt_u16", "v_cmpx_gt_u16">; +defm V_CMPX_EQ_U16 : VOPCX_I16 <"v_cmpx_eq_u16">; +defm V_CMPX_LE_U16 : VOPCX_I16 <"v_cmpx_le_u16", "v_cmpx_ge_u16">; +defm V_CMPX_GT_U16 : VOPCX_I16 <"v_cmpx_gt_u16">; +defm V_CMPX_NE_U16 : VOPCX_I16 <"v_cmpx_ne_u16">; +defm V_CMPX_GE_U16 : VOPCX_I16 <"v_cmpx_ge_u16">; +defm V_CMPX_T_U16 : VOPCX_I16 <"v_cmpx_t_u16">; + +} // End SubtargetPredicate = Has16BitInsts + +defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; +defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">; +defm V_CMP_LE_I32 : VOPC_I32 <"v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; +defm V_CMP_GT_I32 : VOPC_I32 <"v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 <"v_cmp_ne_i32">; +defm V_CMP_GE_I32 : VOPC_I32 <"v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 <"v_cmp_t_i32">; + +defm V_CMPX_F_I32 : VOPCX_I32 <"v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <"v_cmpx_lt_i32", "v_cmpx_gt_i32">; +defm 
V_CMPX_EQ_I32 : VOPCX_I32 <"v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <"v_cmpx_le_i32", "v_cmpx_ge_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 <"v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 <"v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 <"v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 <"v_cmpx_t_i32">; + +defm V_CMP_F_I64 : VOPC_I64 <"v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 <"v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; +defm V_CMP_EQ_I64 : VOPC_I64 <"v_cmp_eq_i64">; +defm V_CMP_LE_I64 : VOPC_I64 <"v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; +defm V_CMP_GT_I64 : VOPC_I64 <"v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 <"v_cmp_ne_i64">; +defm V_CMP_GE_I64 : VOPC_I64 <"v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 <"v_cmp_t_i64">; + +defm V_CMPX_F_I64 : VOPCX_I64 <"v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <"v_cmpx_lt_i64", "v_cmpx_gt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 <"v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <"v_cmpx_le_i64", "v_cmpx_ge_i64">; +defm V_CMPX_GT_I64 : VOPCX_I64 <"v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 <"v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 <"v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 <"v_cmpx_t_i64">; + +defm V_CMP_F_U32 : VOPC_I32 <"v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 <"v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; +defm V_CMP_EQ_U32 : VOPC_I32 <"v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 <"v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; +defm V_CMP_GT_U32 : VOPC_I32 <"v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 <"v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 <"v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 <"v_cmp_t_u32">; + +defm V_CMPX_F_U32 : VOPCX_I32 <"v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <"v_cmpx_lt_u32", "v_cmpx_gt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 <"v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <"v_cmpx_le_u32", "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 <"v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 <"v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 <"v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 <"v_cmpx_t_u32">; + +defm V_CMP_F_U64 : VOPC_I64 <"v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 <"v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; +defm V_CMP_EQ_U64 : VOPC_I64 <"v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 <"v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; +defm V_CMP_GT_U64 : VOPC_I64 <"v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 <"v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 <"v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 <"v_cmp_t_u64">; + +defm V_CMPX_F_U64 : VOPCX_I64 <"v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <"v_cmpx_lt_u64", "v_cmpx_gt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 <"v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <"v_cmpx_le_u64", "v_cmpx_ge_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 <"v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 <"v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 <"v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">; + +//===----------------------------------------------------------------------===// +// Class instructions +//===----------------------------------------------------------------------===// + +class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : + VOPC_Profile<sched, vt, i32> { + let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, 
Src1SDWA:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; + let HasSrc1Mods = 0; + let HasClamp = 0; + let HasOMod = 0; +} + +class getVOPCClassPat64 <VOPProfile P> { + list<dag> ret = + [(set i1:$sdst, + (AMDGPUfp_class + (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), + P.Src1VT:$src1))]; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { + def _e32 : VOPC_Pseudo <opName, p> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + } + + def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = p.Schedule; + } + + def _sdwa : VOPC_SDWA_Pseudo <opName, p> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + } +} + +def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; +def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; + +multiclass VOPC_CLASS_F16 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>; + +multiclass VOPCX_CLASS_F16 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + +multiclass VOPC_CLASS_F32 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>; + +multiclass VOPCX_CLASS_F32 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + +multiclass VOPC_CLASS_F64 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>; + +multiclass VOPCX_CLASS_F64 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>; + +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; +defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; +defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; + +//===----------------------------------------------------------------------===// +// V_ICMPIntrinsic Pattern. 
+//===----------------------------------------------------------------------===// + +let Predicates = [isGCN] in { + +class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat < + (AMDGPUsetcc vt:$src0, vt:$src1, cond), + (inst $src0, $src1) +>; + +def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>; +def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>; +def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>; +def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>; +def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>; +def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>; +def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>; +def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>; +def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>; +def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>; + +def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>; +def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>; +def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>; +def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>; +def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>; +def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>; +def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>; +def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>; +def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>; +def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>; + +class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat < + (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; +def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>; +def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>; +def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>; +def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>; +def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>; + +def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>; +def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>; +def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>; +def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>; +def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>; +def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>; + +def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>; +def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>; +def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>; +def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>; +def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>; +def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>; + +def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>; +def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>; +def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>; +def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>; +def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; +def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; + +} // End Predicates = [isGCN] + +//===----------------------------------------------------------------------===// +// Target +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SI +//===----------------------------------------------------------------------===// + +multiclass VOPC_Real_si <bits<9> op> { + let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { + def _e32_si : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), 
SIEncodingFamily.SI>, + VOPCe<op{7-0}>; + + def _e64_si : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 + // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst + bits<8> sdst; + let Inst{7-0} = sdst; + } + } + def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), + !cast<Instruction>(NAME#"_e32_si")> { + let AssemblerPredicate = isSICI; + } +} + +defm V_CMP_F_F32 : VOPC_Real_si <0x0>; +defm V_CMP_LT_F32 : VOPC_Real_si <0x1>; +defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>; +defm V_CMP_LE_F32 : VOPC_Real_si <0x3>; +defm V_CMP_GT_F32 : VOPC_Real_si <0x4>; +defm V_CMP_LG_F32 : VOPC_Real_si <0x5>; +defm V_CMP_GE_F32 : VOPC_Real_si <0x6>; +defm V_CMP_O_F32 : VOPC_Real_si <0x7>; +defm V_CMP_U_F32 : VOPC_Real_si <0x8>; +defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>; +defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>; +defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>; +defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>; +defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>; +defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>; +defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>; + +defm V_CMPX_F_F32 : VOPC_Real_si <0x10>; +defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>; +defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>; +defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>; +defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>; +defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>; +defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>; +defm V_CMPX_O_F32 : VOPC_Real_si <0x17>; +defm V_CMPX_U_F32 : VOPC_Real_si <0x18>; +defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>; +defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>; +defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>; +defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>; +defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>; +defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>; +defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>; + +defm V_CMP_F_F64 : VOPC_Real_si <0x20>; +defm V_CMP_LT_F64 : VOPC_Real_si <0x21>; +defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>; +defm V_CMP_LE_F64 : VOPC_Real_si <0x23>; +defm V_CMP_GT_F64 : VOPC_Real_si <0x24>; +defm V_CMP_LG_F64 : VOPC_Real_si <0x25>; +defm V_CMP_GE_F64 : VOPC_Real_si <0x26>; +defm V_CMP_O_F64 : VOPC_Real_si <0x27>; +defm V_CMP_U_F64 : VOPC_Real_si <0x28>; +defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>; +defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>; +defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>; +defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>; +defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>; +defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>; +defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>; + +defm V_CMPX_F_F64 : VOPC_Real_si <0x30>; +defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>; +defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>; +defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>; +defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>; +defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>; +defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>; +defm V_CMPX_O_F64 : VOPC_Real_si <0x37>; +defm V_CMPX_U_F64 : VOPC_Real_si <0x38>; +defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>; +defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>; +defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>; +defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>; +defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>; +defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>; +defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>; + +defm V_CMPS_F_F32 : VOPC_Real_si <0x40>; +defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>; +defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>; +defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>; +defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>; +defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>; +defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>; +defm V_CMPS_O_F32 
: VOPC_Real_si <0x47>; +defm V_CMPS_U_F32 : VOPC_Real_si <0x48>; +defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>; +defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>; +defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>; +defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>; +defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>; +defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>; +defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>; + +defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>; +defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>; +defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>; +defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>; +defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>; +defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>; +defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>; +defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>; +defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>; +defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>; +defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>; +defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>; +defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>; +defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>; +defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>; +defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>; + +defm V_CMPS_F_F64 : VOPC_Real_si <0x60>; +defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>; +defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>; +defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>; +defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>; +defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>; +defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>; +defm V_CMPS_O_F64 : VOPC_Real_si <0x67>; +defm V_CMPS_U_F64 : VOPC_Real_si <0x68>; +defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>; +defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>; +defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>; +defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>; +defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>; +defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>; +defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>; + +defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>; +defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>; +defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>; +defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>; +defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>; +defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>; +defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>; +defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>; +defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>; +defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>; +defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>; +defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>; +defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>; +defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>; +defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>; +defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>; + +defm V_CMP_F_I32 : VOPC_Real_si <0x80>; +defm V_CMP_LT_I32 : VOPC_Real_si <0x81>; +defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>; +defm V_CMP_LE_I32 : VOPC_Real_si <0x83>; +defm V_CMP_GT_I32 : VOPC_Real_si <0x84>; +defm V_CMP_NE_I32 : VOPC_Real_si <0x85>; +defm V_CMP_GE_I32 : VOPC_Real_si <0x86>; +defm V_CMP_T_I32 : VOPC_Real_si <0x87>; + +defm V_CMPX_F_I32 : VOPC_Real_si <0x90>; +defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>; +defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>; +defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>; +defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>; +defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>; +defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>; +defm V_CMPX_T_I32 : VOPC_Real_si <0x97>; + +defm V_CMP_F_I64 : VOPC_Real_si <0xa0>; +defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>; +defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>; +defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>; +defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>; +defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>; +defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>; +defm V_CMP_T_I64 : VOPC_Real_si <0xa7>; + +defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>; +defm 
V_CMPX_LT_I64 : VOPC_Real_si <0xb1>; +defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>; +defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>; +defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>; +defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>; +defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>; +defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>; + +defm V_CMP_F_U32 : VOPC_Real_si <0xc0>; +defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>; +defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>; +defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>; +defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>; +defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>; +defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>; +defm V_CMP_T_U32 : VOPC_Real_si <0xc7>; + +defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>; +defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>; +defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>; +defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>; +defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>; +defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>; +defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>; +defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>; + +defm V_CMP_F_U64 : VOPC_Real_si <0xe0>; +defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>; +defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>; +defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>; +defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>; +defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>; +defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>; +defm V_CMP_T_U64 : VOPC_Real_si <0xe7>; + +defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>; +defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>; +defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>; +defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>; +defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>; +defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>; +defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>; +defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>; + +defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>; +defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>; +defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>; +defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>; + +//===----------------------------------------------------------------------===// +// VI +//===----------------------------------------------------------------------===// + +multiclass VOPC_Real_vi <bits<10> op> { + let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + def _e32_vi : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, + VOPCe<op{7-0}>; + + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3a_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 + // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst + bits<8> sdst; + let Inst{7-0} = sdst; + } + } + + def _sdwa_vi : + VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), + !cast<Instruction>(NAME#"_e32_vi")> { + let AssemblerPredicate = isVI; + } +} + +defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>; +defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>; +defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>; +defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>; +defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>; +defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>; + +defm V_CMP_F_F16 : VOPC_Real_vi <0x20>; +defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>; +defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>; +defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>; +defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>; +defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>; +defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>; +defm V_CMP_O_F16 : VOPC_Real_vi <0x27>; +defm V_CMP_U_F16 : VOPC_Real_vi <0x28>; +defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>; +defm V_CMP_NLG_F16 : 
VOPC_Real_vi <0x2a>; +defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>; +defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>; +defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>; +defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>; +defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>; + +defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>; +defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>; +defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>; +defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>; +defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>; +defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>; +defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>; +defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>; +defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>; +defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>; +defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>; +defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>; +defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>; +defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>; +defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x3e>; +defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x3f>; + +defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; +defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; +defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; +defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; +defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; +defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; +defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; +defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; +defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; +defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; +defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; +defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; +defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; +defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; +defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; +defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; + +defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; +defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; +defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; +defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; +defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; +defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; +defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; +defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; +defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; +defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; +defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; +defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; +defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; +defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; +defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; +defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; + +defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; +defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; +defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; +defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; +defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; +defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; +defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; +defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; +defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; +defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; +defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; +defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; +defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; +defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; +defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; +defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; + +defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; +defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; +defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; +defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; +defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; +defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; +defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; +defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; +defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; +defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; +defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; +defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; +defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; +defm 
V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; +defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; +defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; + +defm V_CMP_F_I16 : VOPC_Real_vi <0xa0>; +defm V_CMP_LT_I16 : VOPC_Real_vi <0xa1>; +defm V_CMP_EQ_I16 : VOPC_Real_vi <0xa2>; +defm V_CMP_LE_I16 : VOPC_Real_vi <0xa3>; +defm V_CMP_GT_I16 : VOPC_Real_vi <0xa4>; +defm V_CMP_NE_I16 : VOPC_Real_vi <0xa5>; +defm V_CMP_GE_I16 : VOPC_Real_vi <0xa6>; +defm V_CMP_T_I16 : VOPC_Real_vi <0xa7>; + +defm V_CMP_F_U16 : VOPC_Real_vi <0xa8>; +defm V_CMP_LT_U16 : VOPC_Real_vi <0xa9>; +defm V_CMP_EQ_U16 : VOPC_Real_vi <0xaa>; +defm V_CMP_LE_U16 : VOPC_Real_vi <0xab>; +defm V_CMP_GT_U16 : VOPC_Real_vi <0xac>; +defm V_CMP_NE_U16 : VOPC_Real_vi <0xad>; +defm V_CMP_GE_U16 : VOPC_Real_vi <0xae>; +defm V_CMP_T_U16 : VOPC_Real_vi <0xaf>; + +defm V_CMPX_F_I16 : VOPC_Real_vi <0xb0>; +defm V_CMPX_LT_I16 : VOPC_Real_vi <0xb1>; +defm V_CMPX_EQ_I16 : VOPC_Real_vi <0xb2>; +defm V_CMPX_LE_I16 : VOPC_Real_vi <0xb3>; +defm V_CMPX_GT_I16 : VOPC_Real_vi <0xb4>; +defm V_CMPX_NE_I16 : VOPC_Real_vi <0xb5>; +defm V_CMPX_GE_I16 : VOPC_Real_vi <0xb6>; +defm V_CMPX_T_I16 : VOPC_Real_vi <0xb7>; + +defm V_CMPX_F_U16 : VOPC_Real_vi <0xb8>; +defm V_CMPX_LT_U16 : VOPC_Real_vi <0xb9>; +defm V_CMPX_EQ_U16 : VOPC_Real_vi <0xba>; +defm V_CMPX_LE_U16 : VOPC_Real_vi <0xbb>; +defm V_CMPX_GT_U16 : VOPC_Real_vi <0xbc>; +defm V_CMPX_NE_U16 : VOPC_Real_vi <0xbd>; +defm V_CMPX_GE_U16 : VOPC_Real_vi <0xbe>; +defm V_CMPX_T_U16 : VOPC_Real_vi <0xbf>; + +defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; +defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; +defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; +defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; +defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; +defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; +defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; +defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; + +defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; +defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; +defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; +defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; +defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; +defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; +defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; +defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; + +defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; +defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>; +defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; +defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; +defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; +defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; +defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; +defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; + +defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; +defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; +defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; +defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; +defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>; +defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; +defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; +defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; + +defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; +defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; +defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; +defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; +defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; +defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; +defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; +defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; + +defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; +defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; +defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; +defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; +defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; +defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; +defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; +defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; + +defm V_CMP_F_U64 : 
VOPC_Real_vi <0xe8>; +defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; +defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; +defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; +defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; +defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; +defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; +defm V_CMP_T_U64 : VOPC_Real_vi <0xef>; + +defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; +defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; +defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; +defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; +defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; +defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; +defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; +defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td new file mode 100644 index 0000000..5f72f97 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -0,0 +1,350 @@ +//===-- VOPInstructions.td - Vector Instruction Defintions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// dummies for outer let +class LetDummies { + bit isCommutable; + bit isConvertibleToThreeAddress; + bit isMoveImm; + bit isReMaterializable; + bit isAsCheapAsAMove; + bit VOPAsmPrefer32Bit; + Predicate SubtargetPredicate; + string Constraints; + string DisableEncoding; + list<SchedReadWrite> SchedRW; + list<Register> Uses; + list<Register> Defs; +} + +class VOP <string opName> { + string OpName = opName; +} + +class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VALU = 1; + let Uses = [EXEC]; +} + +class VOP3Common <dag outs, dag ins, string asm = "", + list<dag> pattern = [], bit HasMods = 0, + bit VOP3Only = 0> : + VOPAnyCommon <outs, ins, asm, pattern> { + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always preferred, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). + let AddedComplexity = -1000; + + let VOP3 = 1; + + let AsmMatchConverter = + !if(!eq(VOP3Only,1), + "cvtVOP3", + !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); + + let AsmVariantName = AMDGPUAsmVariants.VOP3; + + let isCodeGenOnly = 0; + + int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. + let hasPostISelHook = 1; +} + +class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> : + InstSI <P.Outs64, P.Ins64, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>, + MnemonicAlias<opName#"_e64", opName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.Asm64; + + let Size = 8; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SubtargetPredicate = isGCN; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. 
+ let hasPostISelHook = 1; + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always preferred, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). + let AddedComplexity = -1000; + + let VOP3 = 1; + let VALU = 1; + let Uses = [EXEC]; + + let AsmVariantName = AMDGPUAsmVariants.VOP3; + let AsmMatchConverter = + !if(!eq(VOP3Only,1), + "cvtVOP3", + !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", "")); + + VOPProfile Pfl = P; +} + +class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, EncodingFamily> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class VOP3a<VOPProfile P> : Enc64 { + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); + + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = !if(P.HasSrc0, src0, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + +class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> { + let Inst{25-17} = op; + let Inst{11} = !if(P.HasClamp, clamp{0}, 0); +} + +class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> { + let Inst{25-16} = op; + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); +} + +class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> { + bits<8> vdst; + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); +} + +class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> { + bits<8> vdst; + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); +} + +class VOP3be <VOPProfile P> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = !if(P.HasSrc0, src0, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + +class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> { + let Inst{25-17} = op; +} + +class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> { + bits<1> clamp; + let Inst{25-16} = op; + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); +} + +def SDWA { + // sdwa_sel + int BYTE_0 = 0; + int BYTE_1 = 1; + int BYTE_2 = 2; + int BYTE_3 = 3; + int WORD_0 = 4; + int WORD_1 = 5; + int DWORD = 6; + + // 
dst_unused + int UNUSED_PAD = 0; + int UNUSED_SEXT = 1; + int UNUSED_PRESERVE = 2; +} + +class VOP_SDWAe<VOPProfile P> : Enc64 { + bits<8> src0; + bits<3> src0_sel; + bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<3> src1_sel; + bits<2> src1_modifiers; + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD); + let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); + let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); + let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); +} + +class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : + InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>, + MnemonicAlias <opName#"_sdwa", opName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.AsmSDWA; + + let Size = 8; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let SDWA = 1; + let Uses = [EXEC]; + + let SubtargetPredicate = isVI; + let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA"; + + VOPProfile Pfl = P; +} + +class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class VOP_DPPe<VOPProfile P> : Enc64 { + bits<2> src0_modifiers; + bits<8> src0; + bits<2> src1_modifiers; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{48-40} = dpp_ctrl; + let Inst{51} = bound_ctrl; + let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg + let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs + let Inst{54} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg + let Inst{55} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs + let Inst{59-56} = bank_mask; + let Inst{63-60} = row_mask; +} + +class VOP_DPP <string OpName, VOPProfile P> : + InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>, + VOP_DPPe<P> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + + let VALU = 1; + let DPP = 1; + let Size = 8; + + let AsmMatchConverter = 
!if(!eq(P.HasModifiers,1), "cvtDPP", ""); + let SubtargetPredicate = isVI; + let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "DPP"; +} + +include "VOPCInstructions.td" +include "VOP1Instructions.td" +include "VOP2Instructions.td" +include "VOP3Instructions.td"
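
Editorial note (not part of the patch): the short TableGen sketch below illustrates how the multiclasses introduced above are meant to be combined for a single compare opcode, following the same three-step shape visible throughout this diff. The instruction name V_CMP_EXAMPLE_I32, the chosen condition, and the opcode numbers are hypothetical placeholders; the sketch assumes the VOPC_I32, VOPC_Real_si and VOPC_Real_vi multiclasses defined in VOPCInstructions.td above and is not a free opcode assignment.

// Hypothetical example only -- mirrors the real definitions above
// (compare e.g. V_CMP_GT_I32 and its VOPC_Real_si/VOPC_Real_vi defms).

// 1) Pseudos: one defm over a profile multiclass yields the _e32, _e64 and
//    _sdwa pseudo instructions. A third operand, when present, names the
//    reversed-operand form (lt <-> gt), as V_CMP_LT_I32 does above.
defm V_CMP_EXAMPLE_I32 : VOPC_I32 <"v_cmp_example_i32", COND_SGT>;

// 2) Reals: one defm per encoding family binds the pseudos to hardware
//    opcodes. The values below are invented for illustration, not free slots.
defm V_CMP_EXAMPLE_I32 : VOPC_Real_si <0x84>;
defm V_CMP_EXAMPLE_I32 : VOPC_Real_vi <0xc4>;

Because the pseudos carry the selection patterns and no encoding bits, VOPC_Real_vi can additionally attach the _sdwa_vi real and the e64-to-e32 VOPCInstAlias without any change to the pseudo definitions.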