Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
154 files changed, 24767 insertions, 5972 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 7b0a7f4..5686828 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -11,6 +11,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -23,43 +24,61 @@ class Pass; class Target; class TargetMachine; class PassRegistry; +class Module; // R600 Passes -FunctionPass *createR600VectorRegMerger(TargetMachine &tm); -FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600VectorRegMerger(); +FunctionPass *createR600ExpandSpecialInstrsPass(); FunctionPass *createR600EmitClauseMarkers(); -FunctionPass *createR600ClauseMergePass(TargetMachine &tm); -FunctionPass *createR600Packetizer(TargetMachine &tm); -FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); +FunctionPass *createR600ClauseMergePass(); +FunctionPass *createR600Packetizer(); +FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes -FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); -FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); +FunctionPass *createSIInsertWaitcntsPass(); +FunctionPass *createAMDGPUCodeGenPreparePass(); +FunctionPass *createAMDGPUMachineCFGStructurizerPass(); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); +extern char &AMDGPUMachineCFGStructurizerID; + +void initializeAMDGPUAlwaysInlinePass(PassRegistry&); + +Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; +ModulePass *createAMDGPULowerIntrinsicsPass(); +void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); +extern char &AMDGPULowerIntrinsicsID; + void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIPeepholeSDWAPass(PassRegistry &); +extern char &SIPeepholeSDWAID; + void initializeSIShrinkInstructionsPass(PassRegistry&); extern char &SIShrinkInstructionsID; void initializeSIFixSGPRCopiesPass(PassRegistry &); extern char &SIFixSGPRCopiesID; +void initializeSIFixVGPRCopiesPass(PassRegistry &); +extern char &SIFixVGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -79,18 +98,18 @@ void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; // Passes common to R600 and SI -FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr); +FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOpt::Level 
OptLevel); -ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); -FunctionPass* createAMDGPUUnifyMetadataPass(); +ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; @@ -112,6 +131,15 @@ extern char &SIDebuggerInsertNopsID; void initializeSIInsertWaitsPass(PassRegistry&); extern char &SIInsertWaitsID; +void initializeSIInsertWaitcntsPass(PassRegistry&); +extern char &SIInsertWaitcntsID; + +void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); +extern char &AMDGPUUnifyDivergentExitNodesID; + +ImmutablePass *createAMDGPUAAWrapperPass(); +void initializeAMDGPUAAWrapperPassPass(PassRegistry&); + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); @@ -133,43 +161,53 @@ enum TargetIndex { /// however on the GPU, each address space points to /// a separate piece of memory that is unique from other /// memory locations. -namespace AMDGPUAS { -enum AddressSpaces : unsigned { - PRIVATE_ADDRESS = 0, ///< Address space for private memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) - LOCAL_ADDRESS = 3, ///< Address space for local memory. - FLAT_ADDRESS = 4, ///< Address space for flat memory. - REGION_ADDRESS = 5, ///< Address space for region memory. - PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) +struct AMDGPUAS { + // The following address space values depend on the triple environment. + unsigned PRIVATE_ADDRESS; ///< Address space for private memory. + unsigned FLAT_ADDRESS; ///< Address space for flat memory. + unsigned REGION_ADDRESS; ///< Address space for region memory. + + // The maximum value for flat, generic, local, private, constant and region. + const static unsigned MAX_COMMON_ADDRESS = 5; + + const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0). + const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2) + const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. + const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressible parameter memory (CONST0) + const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressible parameter memory (VTX1) // Do not re-order the CONSTANT_BUFFER_* enums. 
Several places depend on this // order to be able to dynamically index a constant buffer, for example: // // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, + const static unsigned CONSTANT_BUFFER_0 = 8; + const static unsigned CONSTANT_BUFFER_1 = 9; + const static unsigned CONSTANT_BUFFER_2 = 10; + const static unsigned CONSTANT_BUFFER_3 = 11; + const static unsigned CONSTANT_BUFFER_4 = 12; + const static unsigned CONSTANT_BUFFER_5 = 13; + const static unsigned CONSTANT_BUFFER_6 = 14; + const static unsigned CONSTANT_BUFFER_7 = 15; + const static unsigned CONSTANT_BUFFER_8 = 16; + const static unsigned CONSTANT_BUFFER_9 = 17; + const static unsigned CONSTANT_BUFFER_10 = 18; + const static unsigned CONSTANT_BUFFER_11 = 19; + const static unsigned CONSTANT_BUFFER_12 = 20; + const static unsigned CONSTANT_BUFFER_13 = 21; + const static unsigned CONSTANT_BUFFER_14 = 22; + const static unsigned CONSTANT_BUFFER_15 = 23; // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u + const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u; }; -} // namespace AMDGPUAS +namespace llvm { +namespace AMDGPU { +AMDGPUAS getAMDGPUAS(const Module &M); +AMDGPUAS getAMDGPUAS(const TargetMachine &TM); +AMDGPUAS getAMDGPUAS(Triple T); +} // namespace AMDGPU +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index 1302200..f1d899c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -61,18 +61,48 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "Support flat address space" >; +def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", + "FlatInstOffsets", + "true", + "Flat instructions have immediate offset addressing mode" +>; + +def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", + "FlatGlobalInsts", + "true", + "Have global_* flat memory instructions" +>; + +def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", + "FlatScratchInsts", + "true", + "Have scratch_* flat memory instructions" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", "Support unaligned global loads and stores" >; +def FeatureTrapHandler: SubtargetFeature<"trap-handler", + "TrapHandler", + "true", + "Trap handler support" +>; + def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", "UnalignedScratchAccess", "true", "Support unaligned scratch loads and stores" >; +def FeatureApertureRegs : SubtargetFeature<"aperture-regs", + "HasApertureRegs", + "true", + "Has Memory Aperture Base and Size Registers" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -154,6 +184,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts", "Additional intstructions for CI+" >; +def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", + "GFX9Insts", + "true", + "Additional intstructions for GFX9+" +>; + def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "HasSMemRealTime", "true", @@ -172,6 +208,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts", "Has i16/f16 instructions" >; +def FeatureVOP3P : SubtargetFeature<"vop3p", + "HasVOP3PInsts", + "true", + "Has VOP3P packed instructions" +>; + def FeatureMovrel : SubtargetFeature<"movrel", "HasMovrel", "true", @@ -190,16 +232,52 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; -//===------------------------------------------------------------===// -// Subtarget Features (options and debugging) -//===------------------------------------------------------------===// +def FeatureSDWA : SubtargetFeature<"sdwa", + "HasSDWA", + "true", + "Support SDWA (Sub-DWORD Addressing) extension" +>; -def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", - "FP16Denormals", +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", "true", - "Enable half precision denormal handling" + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", + "HasSDWAOutModsVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureDPP : SubtargetFeature<"dpp", + "HasDPP", + "true", + "Support DPP (Data Parallel Primitives) extension" +>; + +//===------------------------------------------------------------===// +// Subtarget Features (options and debugging) +//===------------------------------------------------------------===// + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. @@ -209,13 +287,36 @@ def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", "Enable single precision denormal handling" >; -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", +// Denormal handling for fp64 and fp16 is controlled by the same +// config register when fp16 supported. +// TODO: Do we need a separate f16 setting when not legal? 
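[Review note (not part of the patch)] The three denormal features defined next all write the single FP64FP16Denormals subtarget field, since f64 and f16 denormal handling share one config-register setting. A minimal sketch of what the subtarget side presumably ends up exposing; the member and accessor names here are assumptions for illustration, not taken from this diff:

  bool FP64FP16Denormals = false;   // set by fp64-fp16-denormals, fp64-denormals, or fp16-denormals
  bool hasFP64Denormals() const { return FP64FP16Denormals; }
  bool hasFP16Denormals() const { return FP64FP16Denormals; }  // same control bit as f64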
+def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals", + "FP64FP16Denormals", "true", - "Enable double precision denormal handling", + "Enable double and half precision denormal handling", [FeatureFP64] >; +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64FP16Denormals", + "true", + "Enable double and half precision denormal handling", + [FeatureFP64, FeatureFP64FP16Denormals] +>; + +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP64FP16Denormals", + "true", + "Enable half precision denormal handling", + [FeatureFP64FP16Denormals] +>; + +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", @@ -294,6 +395,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "Force to generate flat instruction for global" >; +def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < + "auto-waitcnt-before-barrier", + "AutoWaitcntBeforeBarrier", + "true", + "Hardware automatically inserts waitcnt before barrier" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -343,7 +451,20 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP + ] +>; + +def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -357,6 +478,16 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, Implies >; +def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, + [FeatureSouthernIslands, + FeatureFastFMAF32, + HalfRate64Ops, + FeatureLDSBankCount32]>; + +def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, + [FeatureSouthernIslands, + FeatureLDSBankCount32]>; + def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, FeatureLDSBankCount32]>; @@ -371,6 +502,10 @@ def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, FeatureLDSBankCount16]>; +def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, + [FeatureSeaIslands, + FeatureLDSBankCount16]>; + def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, [FeatureVolcanicIslands, FeatureLDSBankCount32, @@ -399,6 +534,24 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, FeatureLDSBankCount16, FeatureXNACK]>; +def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, + [FeatureGFX9, + FeatureLDSBankCount32]>; + +def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1, + [FeatureGFX9, + FeatureLDSBankCount32, + FeatureXNACK]>; + +def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, + [FeatureGFX9, + 
FeatureLDSBankCount32]>; + +def FeatureISAVersion9_0_3 : SubtargetFeatureISAVersion <9,0,3, + [FeatureGFX9, + FeatureLDSBankCount32, + FeatureXNACK]>; + //===----------------------------------------------------------------------===// // Debugger related subtarget features. //===----------------------------------------------------------------------===// @@ -448,10 +601,12 @@ def AMDGPUAsmVariants { int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; + string SDWA9 = "SDWA9"; + int SDWA9_ID = 3; string DPP = "DPP"; - int DPP_ID = 3; + int DPP_ID = 4; string Disable = "Disable"; - int Disable_ID = 4; + int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -469,6 +624,12 @@ def SDWAAsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA; } +def SDWA9AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA9_ID; + let Name = AMDGPUAsmVariants.SDWA9; +} + + def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; @@ -481,6 +642,7 @@ def AMDGPU : Target { let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, + SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; } @@ -504,14 +666,34 @@ def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; +def isGFX9 : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX9Insts">; + +// TODO: Either the name to be changed or we simply use IsCI! def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; + "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; + +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, + AssemblerPredicate<"Feature16BitInsts">; +def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"FeatureVOP3P">; + +def HasSDWA : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; +def HasDPP : Predicate<"Subtarget->hasDPP()">, + AssemblerPredicate<"FeatureDPP">; class PredicateControl { Predicate SubtargetPredicate; @@ -532,5 +714,6 @@ include "Processors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" +include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" include "AMDGPUCallingConv.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp new file mode 100644 index 0000000..faa424e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -0,0 +1,147 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of 
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMGPU address space based alias analysis pass. +//===----------------------------------------------------------------------===// + +#include "AMDGPUAliasAnalysis.h" +#include "AMDGPU.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-aa" + +// Register this pass... +char AMDGPUAAWrapperPass::ID = 0; +INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", + "AMDGPU Address space based Alias Analysis", false, true) + +ImmutablePass *llvm::createAMDGPUAAWrapperPass() { + return new AMDGPUAAWrapperPass(); +} + +void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +// Must match the table in getAliasResult. +AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_) + : Arch(Arch_), AS(AS_) { + // These arrarys are indexed by address space value + // enum elements 0 ... to 5 + static const AliasResult ASAliasRulesPrivIsZero[6][6] = { + /* Private Global Constant Group Flat Region*/ + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, + /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias}, + /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias}, + /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias}, + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias} + }; + static const AliasResult ASAliasRulesGenIsZero[6][6] = { + /* Flat Global Constant Group Region Private */ + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, + /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias}, + /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, + /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias}, + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias} + }; + assert(AS.MAX_COMMON_ADDRESS <= 5); + if (AS.FLAT_ADDRESS == 0) { + assert(AS.GLOBAL_ADDRESS == 1 && + AS.REGION_ADDRESS == 4 && + AS.LOCAL_ADDRESS == 3 && + AS.CONSTANT_ADDRESS == 2 && + AS.PRIVATE_ADDRESS == 5); + ASAliasRules = &ASAliasRulesGenIsZero; + } else { + assert(AS.PRIVATE_ADDRESS == 0 && + AS.GLOBAL_ADDRESS == 1 && + AS.CONSTANT_ADDRESS == 2 && + AS.LOCAL_ADDRESS == 3 && + AS.FLAT_ADDRESS == 4 && + AS.REGION_ADDRESS == 5); + ASAliasRules = &ASAliasRulesPrivIsZero; + } +} + +AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1, + unsigned AS2) const { + if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) { + if (Arch == Triple::amdgcn) + report_fatal_error("Pointer address space out of range"); + return AS1 == AS2 ? 
MayAlias : NoAlias; + } + + return (*ASAliasRules)[AS1][AS2]; +} + +AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); + unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); + + AliasResult Result = ASAliasRules.getAliasResult(asA, asB); + if (Result == NoAlias) return Result; + + // Forward the query to the next alias analysis. + return AAResultBase::alias(LocA, LocB); +} + +bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { + const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); + + if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) { + return true; + } + + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { + if (GV->isConstant()) + return true; + } else if (const Argument *Arg = dyn_cast<Argument>(Base)) { + const Function *F = Arg->getParent(); + + // Only assume constant memory for arguments on kernels. + switch (F->getCallingConv()) { + default: + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + } + + unsigned ArgNo = Arg->getArgNo(); + /* On an argument, ReadOnly attribute indicates that the function does + not write through this pointer argument, even though it may write + to the memory that the pointer points to. + On an argument, ReadNone attribute indicates that the function does + not dereference that pointer argument, even though it may read or write + the memory that the pointer points to if accessed through other pointers. + */ + if (F->hasParamAttribute(ArgNo, Attribute::NoAlias) && + (F->hasParamAttribute(ArgNo, Attribute::ReadNone) || + F->hasParamAttribute(ArgNo, Attribute::ReadOnly))) { + return true; + } + } + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h new file mode 100644 index 0000000..5f8ed9b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -0,0 +1,102 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMGPU address space based alias analysis pass. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H + +#include "AMDGPU.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +namespace llvm { + +/// A simple AA result that uses TBAA metadata to answer queries. 
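[Review note (not part of the patch)] A hedged sketch of how a downstream legacy pass could consume this analysis through the wrapper declared just below; MyPass is an illustrative name, not something added by this change:

  void MyPass::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<AMDGPUAAWrapperPass>();   // make the immutable AA pass available
    AU.setPreservesAll();
  }

  bool MyPass::runOnFunction(Function &F) {
    AMDGPUAAResult &AAR = getAnalysis<AMDGPUAAWrapperPass>().getResult();
    // Per the rule tables above, e.g. constant (AS 2) vs. global (AS 1)
    // pointers report NoAlias; anything involving flat stays MayAlias.
    return false;
  }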
+class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { + friend AAResultBase<AMDGPUAAResult>; + + const DataLayout &DL; + AMDGPUAS AS; + +public: + explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(), + DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {} + AMDGPUAAResult(AMDGPUAAResult &&Arg) + : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS), + ASAliasRules(Arg.ASAliasRules){} + + /// Handle invalidation events from the new pass manager. + /// + /// By definition, this result is stateless and so remains valid. + bool invalidate(Function &, const PreservedAnalyses &) { return false; } + + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); + bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal); + +private: + bool Aliases(const MDNode *A, const MDNode *B) const; + bool PathAliases(const MDNode *A, const MDNode *B) const; + + class ASAliasRulesTy { + public: + ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_); + AliasResult getAliasResult(unsigned AS1, unsigned AS2) const; + private: + Triple::ArchType Arch; + AMDGPUAS AS; + const AliasResult (*ASAliasRules)[6][6]; + } ASAliasRules; +}; + +/// Analysis pass providing a never-invalidated alias analysis result. +class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> { + friend AnalysisInfoMixin<AMDGPUAA>; + static char PassID; + +public: + typedef AMDGPUAAResult Result; + + AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) { + return AMDGPUAAResult(F.getParent()->getDataLayout(), + Triple(F.getParent()->getTargetTriple())); + } +}; + +/// Legacy wrapper pass to provide the AMDGPUAAResult object. +class AMDGPUAAWrapperPass : public ImmutablePass { + std::unique_ptr<AMDGPUAAResult> Result; + +public: + static char ID; + + AMDGPUAAWrapperPass() : ImmutablePass(ID) { + initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + AMDGPUAAResult &getResult() { return *Result; } + const AMDGPUAAResult &getResult() const { return *Result; } + + bool doInitialization(Module &M) override { + Result.reset(new AMDGPUAAResult(M.getDataLayout(), + Triple(M.getTargetTriple()))); + return false; + } + bool doFinalization(Module &M) override { + Result.reset(); + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} +#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 067a16a..6f3742e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -9,7 +9,7 @@ // /// \file /// This pass marks all internal functions as always_inline and creates -/// duplicates of all other functions a marks the duplicates as always_inline. +/// duplicates of all other functions and marks the duplicates as always_inline. 
// //===----------------------------------------------------------------------===// @@ -22,16 +22,22 @@ using namespace llvm; namespace { class AMDGPUAlwaysInline : public ModulePass { - static char ID; + bool GlobalOpt; public: - AMDGPUAlwaysInline() : ModulePass(ID) { } + static char ID; + + AMDGPUAlwaysInline(bool GlobalOpt = false) : + ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } }; } // End anonymous namespace +INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", + "AMDGPU Inline All Functions", false, false) + char AMDGPUAlwaysInline::ID = 0; bool AMDGPUAlwaysInline::runOnModule(Module &M) { @@ -45,8 +51,10 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } - for (GlobalAlias* A : AliasesToRemove) { - A->eraseFromParent(); + if (GlobalOpt) { + for (GlobalAlias* A : AliasesToRemove) { + A->eraseFromParent(); + } } for (Function &F : M) { @@ -70,6 +78,6 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { return false; } -ModulePass *llvm::createAMDGPUAlwaysInlinePass() { - return new AMDGPUAlwaysInline(); +ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) { + return new AMDGPUAlwaysInline(GlobalOpt); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index c98d25e2..c68e586 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,8 +13,12 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -24,31 +28,34 @@ using namespace llvm; namespace { -class AMDGPUAnnotateKernelFeatures : public ModulePass { +class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: - static bool hasAddrSpaceCast(const Function &F); + const TargetMachine *TM = nullptr; + AMDGPUAS AS; - void addAttrToCallers(Function *Intrin, StringRef AttrName); - bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + bool addFeatureAttributes(Function &F); public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } - bool runOnModule(Module &M) override; + AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} + + bool doInitialization(CallGraph &CG) override; + bool runOnSCC(CallGraphSCC &SCC) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - ModulePass::getAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); } - static bool visitConstantExpr(const ConstantExpr *CE); + static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS); static bool visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited); + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, + AMDGPUAS AS); }; } @@ -62,18 +69,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, // The queue ptr is only needed when casting to flat, not from it. 
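[Review note (not part of the patch)] Casting a local or private pointer to flat needs the corresponding aperture base; without hardware aperture registers (FeatureApertureRegs above) that base comes from the queue pointer, which is why addFeatureAttributes below skips the scan entirely when ST.hasApertureRegs(). A condensed sketch of that decision; collectAddrSpaceCasts is a hypothetical helper used only for illustration:

  bool NeedQueuePtr = false;
  if (!ST.hasApertureRegs()) {                        // GFX9 can read apertures directly
    for (const AddrSpaceCastInst *ASC : collectAddrSpaceCasts(F))
      NeedQueuePtr |= castRequiresQueuePtr(ASC, AS);  // true for local/private -> flat
  }
  if (NeedQueuePtr)
    F.addFnAttr("amdgpu-queue-ptr");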
-static bool castRequiresQueuePtr(unsigned SrcAS) { - return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; +static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) { + return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS; } -static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { - return castRequiresQueuePtr(ASC->getSrcAddressSpace()); +static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC, + const AMDGPUAS &AS) { + return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS); } -bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { +bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE, + AMDGPUAS AS) { if (CE->getOpcode() == Instruction::AddrSpaceCast) { unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); - return castRequiresQueuePtr(SrcAS); + return castRequiresQueuePtr(SrcAS, AS); } return false; @@ -81,7 +90,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, + AMDGPUAS AS) { if (!ConstantExprVisited.insert(EntryC).second) return false; @@ -94,7 +104,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( // Check this constant expression. if (const auto *CE = dyn_cast<ConstantExpr>(C)) { - if (visitConstantExpr(CE)) + if (visitConstantExpr(CE, AS)) return true; } @@ -114,15 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( return false; } -// Return true if an addrspacecast is used that requires the queue ptr. -bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) { +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. 
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, + bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool handleAttr(Function &Parent, const Function &Callee, + StringRef Name) { + if (Callee.hasFnAttribute(Name)) { + Parent.addFnAttr(Name); + return true; + } + + return false; +} + +static void copyFeaturesToFunction(Function &Parent, const Function &Callee, + bool &NeedQueuePtr) { + // X ids unnecessarily propagated to kernels. + static const StringRef AttrNames[] = { + { "amdgpu-work-item-id-x" }, + { "amdgpu-work-item-id-y" }, + { "amdgpu-work-item-id-z" }, + { "amdgpu-work-group-id-x" }, + { "amdgpu-work-group-id-y" }, + { "amdgpu-work-group-id-z" }, + { "amdgpu-dispatch-ptr" }, + { "amdgpu-dispatch-id" }, + { "amdgpu-kernarg-segment-ptr" } + }; + + if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) + NeedQueuePtr = true; + + for (StringRef AttrName : AttrNames) + handleAttr(Parent, Callee, AttrName); +} + +bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + bool HasFlat = ST.hasFlatAddressSpace(); + bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + bool Changed = false; + bool NeedQueuePtr = false; + bool HaveCall = false; + bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (CS) { + Function *Callee = CS.getCalledFunction(); + + // TODO: Do something with indirect calls. 
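[Review note (not part of the patch)] Net effect of the mapping above: a function that reaches llvm.amdgcn.workitem.id.y, either directly or through a non-intrinsic callee via copyFeaturesToFunction, ends up carrying "amdgpu-work-item-id-y" as a string function attribute, so later lowering only has to test the attribute. Illustrative check, not code from this change:

  bool needsWorkItemIDY(const Function &F) {
    return F.hasFnAttribute("amdgpu-work-item-id-y");  // set by this pass when the intrinsic is reachable
  }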
+ if (!Callee) { + if (!CS.isInlineAsm()) + HaveCall = true; + continue; + } + + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID == Intrinsic::not_intrinsic) { + HaveCall = true; + copyFeaturesToFunction(F, *Callee, NeedQueuePtr); + Changed = true; + } else { + bool NonKernelOnly = false; + StringRef AttrName = intrinsicToAttrName(IID, + NonKernelOnly, NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } + } + } + + if (NeedQueuePtr || HasApertureRegs) + continue; + if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC)) - return true; + if (castRequiresQueuePtr(ASC, AS)) { + NeedQueuePtr = true; + continue; + } } for (const Use &U : I.operands()) { @@ -130,93 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) { if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) - return true; + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + NeedQueuePtr = true; + break; + } } } } - return false; -} - -void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, - StringRef AttrName) { - SmallPtrSet<Function *, 4> SeenFuncs; - - for (User *U : Intrin->users()) { - // CallInst is the only valid user for an intrinsic. - CallInst *CI = cast<CallInst>(U); - - Function *CallingFunction = CI->getParent()->getParent(); - if (SeenFuncs.insert(CallingFunction).second) - CallingFunction->addFnAttr(AttrName); + if (NeedQueuePtr) { + F.addFnAttr("amdgpu-queue-ptr"); + Changed = true; } -} - -bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( - Module &M, - ArrayRef<StringRef[2]> IntrinsicToAttr) { - bool Changed = false; - for (const StringRef *Arr : IntrinsicToAttr) { - if (Function *Fn = M.getFunction(Arr[0])) { - addAttrToCallers(Fn, Arr[1]); - Changed = true; - } + // TODO: We could refine this to captured pointers that could possibly be + // accessed by flat instructions. For now this is mostly a poor way of + // estimating whether there are calls before argument lowering. + if (HasFlat && !IsFunc && HaveCall) { + F.addFnAttr("amdgpu-flat-scratch"); + Changed = true; } return Changed; } -bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { +bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { + Module &M = SCC.getCallGraph().getModule(); Triple TT(M.getTargetTriple()); - static const StringRef IntrinsicToAttr[][2] = { - // .x omitted - { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, - { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, - - { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, - { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, - - { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, - { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, - - // .x omitted - { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, - { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; - - static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" } - }; - - // TODO: We should not add the attributes if the known compile time workgroup - // size is 1 for y/z. - - // TODO: Intrinsics that require queue ptr. 
+ bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; - // We do not need to note the x workitem or workgroup id because they are - // always initialized. + Changed |= addFeatureAttributes(*F); + } - bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { - Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); - for (Function &F : M) { - if (F.hasFnAttribute("amdgpu-queue-ptr")) - continue; + return Changed; +} - if (hasAddrSpaceCast(F)) - F.addFnAttr("amdgpu-queue-ptr"); - } - } +bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + report_fatal_error("TargetMachine is required"); - return Changed; + AS = AMDGPU::getAMDGPUAS(CG.getModule()); + TM = &TPC->getTM<TargetMachine>(); + return false; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { +Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { return new AMDGPUAnnotateKernelFeatures(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index c011be6..ed53708 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -19,8 +19,8 @@ #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -37,6 +37,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, LoopInfo *LI; DenseMap<Value*, GetElementPtrInst*> noClobberClones; bool isKernelFunc; + AMDGPUAS AMDGPUASI; public: static char ID; @@ -106,11 +107,12 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { DFS(Start, Checklist); for (auto &BB : Checklist) { - BasicBlock::iterator StartIt = (BB == Load->getParent()) ? - BasicBlock::iterator(Load) : BB->end(); - if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), - true, StartIt, BB, Load).isClobber()) - return true; + BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ? 
+ BasicBlock::iterator(Load) : BB->end(); + auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true, + StartIt, BB, Load); + if (Q.isClobber() || Q.isUnknown()) + return true; } return false; } @@ -130,8 +132,8 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; - auto isGlobalLoad = [](LoadInst &Load)->bool { - return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + auto isGlobalLoad = [&](LoadInst &Load)->bool { + return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }; // We're tracking up to the Function boundaries // We cannot go beyond because of FunctionPass restrictions @@ -166,6 +168,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { } bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + AMDGPUASI = AMDGPU::getAMDGPUAS(M); return false; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 974e79f..2247814 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -17,25 +17,25 @@ // #include "AMDGPUAsmPrinter.h" -#include "MCTargetDesc/AMDGPUTargetStreamer.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "Utils/AMDGPUBaseInfo.h" #include "AMDGPU.h" -#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" #include "SIDefines.h" -#include "SIMachineFunctionInfo.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -93,33 +93,40 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} + : AsmPrinter(TM, std::move(Streamer)) { + AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); + } StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; } +const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { + return TM.getMCSubtargetInfo(); +} + +AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const { + return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer()); +} + void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - // Need to construct an MCSubtargetInfo here in case we have no functions - // in the module. 
- std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( - TM.getTargetTriple().str(), TM.getTargetCPU(), - TM.getTargetFeatureString())); - - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); - TS->EmitDirectiveHSACodeObjectVersion(2, 1); + getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1); + getTargetStreamer().EmitDirectiveHSACodeObjectISA( + ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); + getTargetStreamer().EmitStartOfCodeObjectMetadata(M); +} - AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; - // Emit runtime metadata. - TS->EmitRuntimeMetadata(M); + getTargetStreamer().EmitEndOfCodeObjectMetadata(); } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -136,25 +143,34 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); } - void AMDGPUAsmPrinter::EmitFunctionBodyStart() { + const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>(); + if (!MFI->isEntryFunction()) + return; + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - SIProgramInfo KernelInfo; + amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { - getSIProgramInfo(KernelInfo, *MF); - EmitAmdKernelCodeT(*MF, KernelInfo); + getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer().EmitAMDKernelCodeT(KernelCode); } + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(), + KernelCode); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) { - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, MF->getFunction()), - TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); + getTargetStreamer().EmitAMDGPUSymbolType( + SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } AsmPrinter::EmitFunctionEntryLabel(); @@ -163,16 +179,37 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + if (AMDGPU::isGroupSegment(GV, AMDGPUASI)) return; AsmPrinter::EmitGlobalVariable(GV); } +bool AMDGPUAsmPrinter::doFinalization(Module &M) { + CallGraphResourceInfo.clear(); + return AsmPrinter::doFinalization(M); +} + +// Print comments that apply to both callable functions and entry points. 
+void AMDGPUAsmPrinter::emitCommonFunctionComments( + uint32_t NumVGPR, + uint32_t NumSGPR, + uint32_t ScratchSize, + uint64_t CodeSize) { + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + CurrentProgramInfo = SIProgramInfo(); + + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); // The starting address of all shader programs must be 256 bytes aligned. - MF.setAlignment(8); + // Regular functions just need the basic required instruction alignment. + MF.setAlignment(MFI->isEntryFunction() ? 8 : 2); SetupMachineFunction(MF); @@ -184,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->SwitchSection(ConfigSection); } - SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - getSIProgramInfo(KernelInfo, MF); + if (MFI->isEntryFunction()) { + getSIProgramInfo(CurrentProgramInfo, MF); + } else { + auto I = CallGraphResourceInfo.insert( + std::make_pair(MF.getFunction(), SIFunctionResourceInfo())); + SIFunctionResourceInfo &Info = I.first->second; + assert(I.second && "should only be called once per function"); + Info = analyzeResourceUsage(MF); + } + if (!STM.isAmdHsaOS()) { - EmitProgramInfoSI(MF, KernelInfo); + EmitProgramInfoSI(MF, CurrentProgramInfo); } } else { EmitProgramInfoR600(MF); @@ -206,60 +251,78 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->SwitchSection(CommentSection); if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (!MFI->isEntryFunction()) { + OutStreamer->emitRawComment(" Function info:", false); + SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()]; + emitCommonFunctionComments( + Info.NumVGPR, + Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()), + Info.PrivateSegmentSize, + getFunctionCodeSize(MF)); + return false; + } + OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), - false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), - false); - OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), - false); - OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), - false); - OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), - false); - OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + - " bytes/workgroup (compile time only)", false); - - OutStreamer->emitRawComment(" SGPRBlocks: " + - Twine(KernelInfo.SGPRBlocks), false); - OutStreamer->emitRawComment(" VGPRBlocks: " + - Twine(KernelInfo.VGPRBlocks), false); - - OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " + - Twine(KernelInfo.NumSGPRsForWavesPerEU), false); - OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " + - Twine(KernelInfo.NumVGPRsForWavesPerEU), false); - - OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), - false); - OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), - false); + emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + CurrentProgramInfo.NumSGPR, + 
CurrentProgramInfo.ScratchSize, + getFunctionCodeSize(MF)); + + OutStreamer->emitRawComment( + " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); + OutStreamer->emitRawComment( + " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); + OutStreamer->emitRawComment( + " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); + + OutStreamer->emitRawComment( + " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( + " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); + + OutStreamer->emitRawComment( + " NumSGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " NumVGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + + OutStreamer->emitRawComment( + " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst), + false); + OutStreamer->emitRawComment( + " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount), + false); if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + - Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); + OutStreamer->emitRawComment( + " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); + OutStreamer->emitRawComment( + " DebuggerPrivateSegmentBufferSGPR: s" + + Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); } - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), - false); - + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( @@ -300,7 +363,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { const MachineOperand &MO = MI.getOperand(op_idx); if (!MO.isReg()) continue; - unsigned HWReg = 
RI->getEncodingValue(MO.getReg()) & 0xff; + unsigned HWReg = RI->getHWRegIndex(MO.getReg()); // Register with value > 127 aren't GPR if (HWReg > 127) @@ -343,18 +406,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { +uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = STM.getRegisterInfo(); const SIInstrInfo *TII = STM.getInstrInfo(); + uint64_t CodeSize = 0; + for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -363,196 +420,191 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - if (isVerbose()) - CodeSize += TII->getInstSizeInBytes(MI); - - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; - - if (!MO.isReg()) - continue; - - unsigned reg = MO.getReg(); - switch (reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - continue; - - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - VCCUsed = true; - continue; + CodeSize += TII->getInstSizeInBytes(MI); + } + } - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. - if (MFI->hasFlatScratchInit()) - FlatUsed = true; - continue; + return CodeSize; +} - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - default: - break; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? 
maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } - } +static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, + const SIInstrInfo &TII, + unsigned Reg) { + for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { + if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) + return true; } - unsigned ExtraSGPRs = 0; + return false; +} +static unsigned getNumExtraSGPRs(const SISubtarget &ST, + bool VCCUsed, + bool FlatScrUsed) { + unsigned ExtraSGPRs = 0; if (VCCUsed) ExtraSGPRs = 2; - if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatUsed) + if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { + if (FlatScrUsed) ExtraSGPRs = 4; } else { - if (STM.isXNACKEnabled()) + if (ST.isXNACKEnabled()) ExtraSGPRs = 4; - if (FlatUsed) + if (FlatScrUsed) ExtraSGPRs = 6; } - // Record first reserved register and reserved register count fields, and - // update max register counts if "amdgpu-debugger-reserve-regs" attribute was - // requested. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; - ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM); + return ExtraSGPRs; +} - // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and - // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was requested. - if (STM.debuggerEmitPrologue()) { - ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = - RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); - ProgInfo.DebuggerPrivateSegmentBufferSGPR = - RI->getHWRegIndex(MFI->getScratchRSrcReg()); +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( + const SISubtarget &ST) const { + return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch); +} + +AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( + const MachineFunction &MF) const { + SIFunctionResourceInfo Info; + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. Inline assembly may + // need it though. + // + // If we only have implicit uses of flat_scr on flat instructions, it is not + // really needed. + if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && + (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { + Info.UsesFlatScratch = false; + } + + Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); + Info.PrivateSegmentSize = FrameInfo.getStackSize(); + + + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); + + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. 
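The getNumExtraSGPRs helper introduced above folds the VCC, flat-scratch and XNACK reservations into one count. As a rough standalone sketch of that decision table (the subtarget is reduced here to two plain booleans instead of the real SISubtarget queries, so this is an illustration, not the patch's code):

#include <cstdio>

// Simplified model of the reservation rules in getNumExtraSGPRs():
// VCC costs 2 SGPRs when used; flat scratch costs 4 on pre-VI parts
// and 6 on VI+, where enabling XNACK alone already reserves 4.
static unsigned extraSGPRs(bool isVIOrLater, bool xnackEnabled,
                           bool vccUsed, bool flatScrUsed) {
  unsigned Extra = 0;
  if (vccUsed)
    Extra = 2;

  if (!isVIOrLater) {
    if (flatScrUsed)
      Extra = 4;
  } else {
    if (xnackEnabled)
      Extra = 4;
    if (flatScrUsed)
      Extra = 6;
  }
  return Extra;
}

int main() {
  // VI part with XNACK enabled, VCC and flat scratch in use: 6 extra SGPRs.
  std::printf("%u\n", extraSGPRs(true, true, true, true));
  return 0;
}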
+ + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } } + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) { + SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + + ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumSGPR = Info.NumExplicitSGPR; + ProgInfo.ScratchSize = Info.PrivateSegmentSize; + ProgInfo.VCCUsed = Info.UsesVCC; + ProgInfo.FlatUsed = Info.UsesFlatScratch; + ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; + + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *RI = &TII->getRegisterInfo(); + + unsigned ExtraSGPRs = getNumExtraSGPRs(STM, + ProgInfo.VCCUsed, + ProgInfo.FlatUsed); + unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF); + // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { - unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs(); - if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", - MaxSGPR + 1, DS_Error, - DK_ResourceLimit, MaxAddressableNumSGPRs); + ProgInfo.NumSGPR, DS_Error, + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - MaxSGPR = MaxAddressableNumSGPRs - 1; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; } } // Account for extra SGPRs and VGPRs reserved for debugger use. - MaxSGPR += ExtraSGPRs; - MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM); - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - ProgInfo.NumVGPR = MaxVGPR + 1; - ProgInfo.NumSGPR = MaxSGPR + 1; + ProgInfo.NumSGPR += ExtraSGPRs; + ProgInfo.NumVGPR += ExtraVGPRs; // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. 
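The reverse scan above walks the register class from the top, finds the highest physical register that MRI.isPhysRegUsed reports as used, and turns its hardware index into a count. A minimal model of that idea, with register usage reduced to a plain bool array standing in for MachineRegisterInfo (an assumption made only for the sketch):

#include <array>
#include <cstdio>

// Model: Used[i] is true if register i appears anywhere in the function.
// Indices start at 0, so the register count is highest index + 1,
// or 0 when nothing in the class is used at all.
static int countUsedRegs(const std::array<bool, 256> &Used) {
  for (int I = 255; I >= 0; --I) {
    if (Used[I])
      return I + 1;
  }
  return 0;
}

int main() {
  std::array<bool, 256> Used{};
  Used[0] = Used[3] = Used[17] = true;         // highest used index is 17
  std::printf("NumVGPR = %d\n", countUsedRegs(Used));  // prints 18
  return 0;
}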
ProgInfo.NumSGPRsForWavesPerEU = std::max( - ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); ProgInfo.NumVGPRsForWavesPerEU = std::max( - ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { - unsigned MaxNumSGPRs = STM.getMaxNumSGPRs(); - if (ProgInfo.NumSGPR > MaxNumSGPRs) { - // This can happen due to a compiler bug or when using inline asm to use the - // registers which are usually reserved for vcc etc. - + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { + // This can happen due to a compiler bug or when using inline asm to use + // the registers which are usually reserved for vcc etc. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "scalar registers", ProgInfo.NumSGPR, DS_Error, - DK_ResourceLimit, MaxNumSGPRs); + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - ProgInfo.NumSGPR = MaxNumSGPRs; - ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs; + ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs; } } if (STM.hasSGPRInitBug()) { - ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPR = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + ProgInfo.NumSGPRsForWavesPerEU = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; } - if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", - MFI->NumUserSGPRs, DS_Error); + MFI->getNumUserSGPRs(), DS_Error); Ctx.diagnose(Diag); } @@ -565,13 +617,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // SGPRBlocks is actual number of SGPR blocks minus 1. ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, - RI->getSGPRAllocGranule()); - ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1; + STM.getSGPREncodingGranule()); + ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1; // VGPRBlocks is actual number of VGPR blocks minus 1. ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, - RI->getVGPRAllocGranule()); - ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1; + STM.getVGPREncodingGranule()); + ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; + + // Record first reserved VGPR and number of reserved VGPRs. + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; + ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); + + // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and + // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" + // attribute was requested. + if (STM.debuggerEmitPrologue()) { + ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = + RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); + ProgInfo.DebuggerPrivateSegmentBufferSGPR = + RI->getHWRegIndex(MFI->getScratchRSrcReg()); + } // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. 
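The SGPRBlocks/VGPRBlocks fields computed above are encoded as "number of allocation blocks minus one", so the register count is first rounded up to the encoding granule; the clamp to at least 1 earlier in this hunk presumably guarantees the minus-one never underflows. A compact sketch of that arithmetic, with the granule passed in rather than taken from the subtarget:

#include <cassert>
#include <cstdio>

// Round Value up to the next multiple of Align.
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

// Encode a register count as "blocks minus one", as done for
// SGPRBlocks / VGPRBlocks above.
static unsigned encodeBlocks(unsigned NumRegs, unsigned Granule) {
  assert(Granule != 0 && NumRegs != 0);
  return alignTo(NumRegs, Granule) / Granule - 1;
}

int main() {
  // e.g. 42 registers with an encoding granule of 8 -> ceil(42/8) - 1 = 5
  std::printf("%u\n", encodeBlocks(42, 8));
  return 0;
}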
@@ -580,14 +646,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.IEEEMode = STM.enableIEEEBit(MF); // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = 1; - - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo.getStackSize(); - - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; + ProgInfo.DX10Clamp = STM.enableDX10Clamp(); unsigned LDSAlignShift; if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { @@ -599,7 +658,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } unsigned LDSSpillSize = - MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize(); + MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize(); ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; ProgInfo.LDSBlocks = @@ -635,13 +694,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) | S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | S_00B84C_EXCP_EN_MSB(0) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. + S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); } @@ -649,6 +710,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; @@ -656,7 +718,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { } void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { + const SIProgramInfo &CurrentProgramInfo) { const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); @@ -664,31 +726,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4); OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. 
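ComputePGMRSrc2 above is assembled by OR-ing together macro-generated bitfields (user SGPR count, trap handler enable, TGID enables, LDS size, and so on). The general pattern can be shown with a couple of hand-written fields; the widths and shifts below are placeholders for illustration only, not the actual COMPUTE_PGM_RSRC2 layout:

#include <cstdint>
#include <cstdio>

// Generic helper: place Value into a field of Width bits starting at Shift.
static uint32_t field(uint32_t Value, unsigned Shift, unsigned Width) {
  const uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (Value & Mask) << Shift;
}

int main() {
  // Hypothetical layout: bit 0 = scratch enable, bits 1-5 = user SGPR count,
  // bit 6 = trap handler enable. The real register uses different positions.
  uint32_t Rsrc2 = field(1, 0, 1)    // scratch enabled
                 | field(6, 1, 5)    // six user SGPRs
                 | field(0, 6, 1);   // trap handler disabled
  std::printf("0x%08x\n", Rsrc2);    // prints 0x0000000d
  return 0;
}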
} else { OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | - S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); } } if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } @@ -713,97 +775,91 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { } } -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { +void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, + const SIProgramInfo &CurrentProgramInfo, + const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - amd_kernel_code_t header; - AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); - header.compute_pgm_resource_registers = - KernelInfo.ComputePGMRSrc1 | - (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.compute_pgm_resource_registers = + CurrentProgramInfo.ComputePGMRSrc1 | + (CurrentProgramInfo.ComputePGMRSrc2 << 32); + Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + if (CurrentProgramInfo.DynamicCallStack) + Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; - AMD_HSA_BITS_SET(header.code_properties, + AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize())); if (MFI->hasPrivateSegmentBuffer()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (MFI->hasQueuePtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; if (MFI->hasDispatchID()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; if (MFI->hasFlatScratchInit()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - - // TODO: Private segment size + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; if (MFI->hasGridWorkgroupCountX()) { - header.code_properties |= + Out.code_properties |= 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; } if (MFI->hasGridWorkgroupCountY()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; } if (MFI->hasGridWorkgroupCountZ()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.debuggerSupported()) - header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; if (STM.isXNACKEnabled()) - header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; // FIXME: Should use getKernArgSize - header.kernarg_segment_byte_size = + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; - header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; + Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; + Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; + Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; + Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst; + Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. 
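kernarg_segment_alignment below is stored as a log2 exponent with a floor of 4 (that is, 16 bytes), which is why the patch keeps the std::max((size_t)4, countTrailingZeros(...)) form. A standalone sketch of that conversion, assuming the incoming alignment is a power of two:

#include <algorithm>
#include <cassert>
#include <cstdio>

// Convert a power-of-two byte alignment to the log2 form used in
// amd_kernel_code_t, with a minimum of 2^4 = 16 bytes.
static unsigned alignmentExponent(unsigned AlignBytes) {
  assert(AlignBytes != 0 && (AlignBytes & (AlignBytes - 1)) == 0 &&
         "alignment must be a power of two");
  unsigned Log2 = 0;
  while ((1u << Log2) < AlignBytes)
    ++Log2;                  // same result as countTrailingZeros here
  return std::max(4u, Log2);
}

int main() {
  std::printf("%u %u %u\n",
              alignmentExponent(4),    // 2 clamped up, prints 4
              alignmentExponent(16),   // prints 4
              alignmentExponent(64));  // prints 6
  return 0;
}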
- header.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max((size_t)4, countTrailingZeros(MFI->getMaxKernArgAlign())); if (STM.debuggerEmitPrologue()) { - header.debug_wavefront_private_segment_offset_sgpr = - KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - header.debug_private_segment_buffer_sgpr = - KernelInfo.DebuggerPrivateSegmentBufferSGPR; + Out.debug_wavefront_private_segment_offset_sgpr = + CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + Out.debug_private_segment_buffer_sgpr = + CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR; } - - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - TS->EmitAMDKernelCodeT(header); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 9a4bafe..0a58ce0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,95 +15,110 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H -#include "AMDGPUMCInstLower.h" - +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include <cstddef> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> #include <vector> namespace llvm { + +class AMDGPUTargetStreamer; class MCOperand; +class SISubtarget; class AMDGPUAsmPrinter final : public AsmPrinter { private: - struct SIProgramInfo { - SIProgramInfo() : - VGPRBlocks(0), - SGPRBlocks(0), - Priority(0), - FloatMode(0), - Priv(0), - DX10Clamp(0), - DebugMode(0), - IEEEMode(0), - ScratchSize(0), - ComputePGMRSrc1(0), - LDSBlocks(0), - ScratchBlocks(0), - ComputePGMRSrc2(0), - NumVGPR(0), - NumSGPR(0), - FlatUsed(false), - NumSGPRsForWavesPerEU(0), - NumVGPRsForWavesPerEU(0), - ReservedVGPRFirst(0), - ReservedVGPRCount(0), - DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), - DebuggerPrivateSegmentBufferSGPR((uint16_t)-1), - VCCUsed(false), - CodeLen(0) {} + // Track resource usage for callee functions. + struct SIFunctionResourceInfo { + // Track the number of explicitly used VGPRs. Special registers reserved at + // the end are tracked separately. + int32_t NumVGPR = 0; + int32_t NumExplicitSGPR = 0; + uint32_t PrivateSegmentSize = 0; + bool UsesVCC = false; + bool UsesFlatScratch = false; + bool HasDynamicallySizedStack = false; + bool HasRecursion = false; + + int32_t getTotalNumSGPRs(const SISubtarget &ST) const; + }; + // Track resource usage for kernels / entry functions. + struct SIProgramInfo { // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks; - uint32_t SGPRBlocks; - uint32_t Priority; - uint32_t FloatMode; - uint32_t Priv; - uint32_t DX10Clamp; - uint32_t DebugMode; - uint32_t IEEEMode; - uint32_t ScratchSize; - - uint64_t ComputePGMRSrc1; + uint32_t VGPRBlocks = 0; + uint32_t SGPRBlocks = 0; + uint32_t Priority = 0; + uint32_t FloatMode = 0; + uint32_t Priv = 0; + uint32_t DX10Clamp = 0; + uint32_t DebugMode = 0; + uint32_t IEEEMode = 0; + uint32_t ScratchSize = 0; + + uint64_t ComputePGMRSrc1 = 0; // Fields set in PGM_RSRC2 pm4 packet. 
- uint32_t LDSBlocks; - uint32_t ScratchBlocks; + uint32_t LDSBlocks = 0; + uint32_t ScratchBlocks = 0; - uint64_t ComputePGMRSrc2; + uint64_t ComputePGMRSrc2 = 0; - uint32_t NumVGPR; - uint32_t NumSGPR; - uint32_t LDSSize; - bool FlatUsed; + uint32_t NumVGPR = 0; + uint32_t NumSGPR = 0; + uint32_t LDSSize = 0; + bool FlatUsed = false; // Number of SGPRs that meets number of waves per execution unit request. - uint32_t NumSGPRsForWavesPerEU; + uint32_t NumSGPRsForWavesPerEU = 0; // Number of VGPRs that meets number of waves per execution unit request. - uint32_t NumVGPRsForWavesPerEU; + uint32_t NumVGPRsForWavesPerEU = 0; // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first // fixed VGPR number reserved. - uint16_t ReservedVGPRFirst; + uint16_t ReservedVGPRFirst = 0; // The number of consecutive VGPRs reserved. - uint16_t ReservedVGPRCount; + uint16_t ReservedVGPRCount = 0; // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or uint16_t(-1) if the register is not used or not known. - uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // execution, or std::numeric_limits<uint16_t>::max() if the register is not + // used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = + std::numeric_limits<uint16_t>::max(); // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or uint16_t(-1) if the register is not used or not - // known. - uint16_t DebuggerPrivateSegmentBufferSGPR; + // kernel execution, or std::numeric_limits<uint16_t>::max() if the register + // is not used or not known. + uint16_t DebuggerPrivateSegmentBufferSGPR = + std::numeric_limits<uint16_t>::max(); + + // Whether there is recursion, dynamic allocas, indirect calls or some other + // reason there may be statically unknown stack usage. + bool DynamicCallStack = false; // Bonus information for debugging. - bool VCCUsed; - uint64_t CodeLen; + bool VCCUsed = false; + + SIProgramInfo() = default; }; - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + SIProgramInfo CurrentProgramInfo; + DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; + + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; + SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); + void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const; void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; @@ -112,21 +127,33 @@ private: /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; + void emitCommonFunctionComments(uint32_t NumVGPR, + uint32_t NumSGPR, + uint32_t ScratchSize, + uint64_t CodeSize); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override; + const MCSubtargetInfo* getSTI() const; + + AMDGPUTargetStreamer& getTargetStreamer() const; + + bool doFinalization(Module &M) override; + bool runOnMachineFunction(MachineFunction &MF) override; + /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated /// pseudo lowering. 
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + /// \brief Lower the specified LLVM Constant to an MCExpr. + /// The AsmPrinter::lowerConstantof does not know how to lower + /// addrspacecast, therefore they should be lowered by this function. + const MCExpr *lowerConstant(const Constant *CV) override; + /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo /// instructions. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, @@ -143,6 +170,8 @@ public: void EmitStartOfAsmFile(Module &M) override; + void EmitEndOfAsmFile(Module &M) override; + bool isBlockOnlyReachableByFallthrough( const MachineBasicBlock *MBB) const override; @@ -153,8 +182,9 @@ public: protected: std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; + AMDGPUAS AMDGPUASI; }; -} // End anonymous llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index d53cc15..515cc07 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -14,8 +14,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUCallLowering.h" +#include "AMDGPU.h" #include "AMDGPUISelLowering.h" - +#include "AMDGPUSubtarget.h" +#include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -26,17 +31,138 @@ using namespace llvm; #endif AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) - : CallLowering(&TLI) { + : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) { } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg) const { + const Value *Val, unsigned VReg) const { + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); return true; } +unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, + Type *ParamTy, + unsigned Offset) const { + + MachineFunction &MF = MIRBuilder.getMF(); + const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + LLT PtrType = getLLTForType(*PtrTy, DL); + unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); + unsigned KernArgSegmentPtr = + TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); + + unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIRBuilder.buildConstant(OffsetReg, Offset); + + MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + + return DstReg; +} + +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, + Type *ParamTy, unsigned Offset, + unsigned DstReg) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + unsigned TypeSize = DL.getTypeStoreSize(ParamTy); + unsigned Align = DL.getABITypeAlignment(ParamTy); + unsigned 
PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | + MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant, + TypeSize, Align); + + MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); +} + bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { - // TODO: Implement once there are generic loads/stores. + + MachineFunction &MF = MIRBuilder.getMF(); + const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(QueuePtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + const LLT P2 = LLT::pointer(2, 64); + unsigned VReg = MRI.createGenericVirtualRegister(P2); + MRI.addLiveIn(InputPtrReg, VReg); + MIRBuilder.getMBB().addLiveIn(InputPtrReg); + MIRBuilder.buildCopy(VReg, InputPtrReg); + CCInfo.AllocateReg(InputPtrReg); + } + + if (Info->hasDispatchID()) { + unsigned DispatchIDReg = Info->addDispatchID(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchIDReg); + } + + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(FlatScratchInitReg); + } + + unsigned NumArgs = F.arg_size(); + Function::const_arg_iterator CurOrigArg = F.arg_begin(); + const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT(); + ISD::ArgFlagsTy Flags; + Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), + /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + + Function::const_arg_iterator Arg = F.arg_begin(); + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { + // FIXME: We should be getting DebugInfo from the arguments some how. 
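lowerFormalArguments below assigns each kernel argument a byte offset in the kernarg segment and then loads it from base + offset. The offset computation itself is running alignment arithmetic; a simplified model, with each argument described only by its store size and ABI alignment (the real lowering additionally adds the subtarget's explicit kernel-arg offset):

#include <cstdio>
#include <vector>

struct ArgInfo {
  unsigned Size;   // store size in bytes
  unsigned Align;  // ABI alignment in bytes (power of two)
};

// Assign each argument an offset in the kernarg segment: round the running
// offset up to the argument's alignment, record it, then advance by its size.
static std::vector<unsigned> layoutKernArgs(const std::vector<ArgInfo> &Args) {
  std::vector<unsigned> Offsets;
  unsigned Offset = 0;
  for (const ArgInfo &A : Args) {
    Offset = (Offset + A.Align - 1) & ~(A.Align - 1);
    Offsets.push_back(Offset);
    Offset += A.Size;
  }
  return Offsets;
}

int main() {
  // i32, i64, i16 land at offsets 0, 8, 16 under natural alignment.
  for (unsigned O : layoutKernArgs({{4, 4}, {8, 8}, {2, 2}}))
    std::printf("%u ", O);
  std::printf("\n");
  return 0;
}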
+ CCValAssign &VA = ArgLocs[i]; + lowerParameter(MIRBuilder, Arg->getType(), + VA.getLocMemOffset() + + Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]); + } + return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 9ae87c9..251cb7a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H +#include "AMDGPU.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" namespace llvm { @@ -22,6 +23,14 @@ namespace llvm { class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { + AMDGPUAS AMDGPUASI; + + unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset) const; + + void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset, unsigned DstReg) const; + public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); @@ -29,6 +38,8 @@ class AMDGPUCallLowering: public CallLowering { unsigned VReg) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; + static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); + static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); }; } // End of namespace llvm; #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 47dfa49..4bef7a8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -13,11 +13,13 @@ // Inversion of CCIfInReg class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} +class CCIfExtend<CCAction A> + : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; // Calling convention for SI def CC_SI : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -25,17 +27,13 @@ def CC_SI : CallingConv<[ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, - CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, - SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, - SGPR32, SGPR34, SGPR36, SGPR38 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, - SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, - SGPR33, SGPR35, SGPR37, SGPR39 ] - >>>, + // We have no way of referring to the generated register tuples + // here, so use a custom function. + CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, + CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
- CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -53,20 +51,10 @@ def CC_SI : CallingConv<[ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 - ]>>>, - - CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, - SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, - SGPR32, SGPR34, SGPR36, SGPR38 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, - SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, - SGPR33, SGPR35, SGPR37, SGPR39 ] - >>> - + ]>>> ]>; -def RetCC_SI : CallingConv<[ +def RetCC_SI_Shader : CallingConv<[ CCIfType<[i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, @@ -76,7 +64,7 @@ def RetCC_SI : CallingConv<[ ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. - CCIfType<[f32] , CCAssignToReg<[ + CCIfType<[f32, f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -113,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[ CCCustom<"allocateKernArg"> ]>; +def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs< + (sequence "VGPR%u", 24, 255) +>; + +def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< + (sequence "VGPR%u", 32, 255) +>; + +def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs< + (sequence "SGPR%u", 32, 103) +>; + +def CSR_AMDGPU_HighRegs : CalleeSavedRegs< + (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103) +>; + +// Calling convention for leaf functions +def CC_AMDGPU_Func : CallingConv<[ + CCIfByVal<CCPassByVal<4, 4>>, + CCIfType<[i1], CCPromoteToType<i32>>, + CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>, + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, + CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, + CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, + CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> +]>; + +// Calling convention for leaf functions +def RetCC_AMDGPU_Func : CallingConv<[ + CCIfType<[i1], CCPromoteToType<i32>>, + CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, + CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, 
v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">> +]>; + def CC_AMDGPU : CallingConv<[ CCIf<"static_cast<const AMDGPUSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >=" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index e623054..31ee920 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,16 +14,32 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" - +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include <cassert> +#include <iterator> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -33,18 +49,15 @@ namespace { class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { - const GCNTargetMachine *TM; - const SISubtarget *ST; - DivergenceAnalysis *DA; - Module *Mod; - bool HasUnsafeFPMath; + const SISubtarget *ST = nullptr; + DivergenceAnalysis *DA = nullptr; + Module *Mod = nullptr; + bool HasUnsafeFPMath = false; /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. /// /// \returns Binary operation \p V. - Value *copyFlags(const BinaryOperator &I, Value *V) const; - /// \returns \p T's base element bit width. unsigned getBaseElementBitWidth(const Type *T) const; @@ -113,13 +126,8 @@ class AMDGPUCodeGenPrepare : public FunctionPass, public: static char ID; - AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : - FunctionPass(ID), - TM(static_cast<const GCNTargetMachine *>(TM)), - ST(nullptr), - DA(nullptr), - Mod(nullptr), - HasUnsafeFPMath(false) { } + + AMDGPUCodeGenPrepare() : FunctionPass(ID) {} bool visitFDiv(BinaryOperator &I); @@ -142,22 +150,7 @@ public: } }; -} // End anonymous namespace - -Value *AMDGPUCodeGenPrepare::copyFlags( - const BinaryOperator &I, Value *V) const { - BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V); - if (!BinOp) // Possibly constant expression. 
- return V; - - if (isa<OverflowingBinaryOperator>(BinOp)) { - BinOp->setHasNoSignedWrap(I.hasNoSignedWrap()); - BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); - } else if (isa<PossiblyExactOperator>(BinOp)) - BinOp->setIsExact(I.isExact()); - - return V; -} +} // end anonymous namespace unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { assert(needsPromotionToI32(T) && "T does not need promotion to i32"); @@ -186,12 +179,48 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { } bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { - if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 && - T->getIntegerBitWidth() <= 16) + const IntegerType *IntTy = dyn_cast<IntegerType>(T); + if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) + return true; + + if (const VectorType *VT = dyn_cast<VectorType>(T)) { + // TODO: The set of packed operations is more limited, so may want to + // promote some anyway. + if (ST->hasVOP3PInsts()) + return false; + + return needsPromotionToI32(VT->getElementType()); + } + + return false; +} + +// Return true if the op promoted to i32 should have nsw set. +static bool promotedOpIsNSW(const Instruction &I) { + switch (I.getOpcode()) { + case Instruction::Shl: + case Instruction::Add: + case Instruction::Sub: return true; - if (!T->isVectorTy()) + case Instruction::Mul: + return I.hasNoUnsignedWrap(); + default: + return false; + } +} + +// Return true if the op promoted to i32 should have nuw set. +static bool promotedOpIsNUW(const Instruction &I) { + switch (I.getOpcode()) { + case Instruction::Shl: + case Instruction::Add: + case Instruction::Mul: + return true; + case Instruction::Sub: + return I.hasNoUnsignedWrap(); + default: return false; - return needsPromotionToI32(cast<VectorType>(T)->getElementType()); + } } bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { @@ -218,7 +247,19 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); } - ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1)); + + ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); + if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { + if (promotedOpIsNSW(cast<Instruction>(I))) + Inst->setHasNoSignedWrap(); + + if (promotedOpIsNUW(cast<Instruction>(I))) + Inst->setHasNoUnsignedWrap(); + + if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) + Inst->setIsExact(ExactOp->isExact()); + } + TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); I.replaceAllUsesWith(TruncRes); @@ -339,16 +380,16 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FastMathFlags FMF = FPOp->getFastMathFlags(); bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || FMF.allowReciprocal(); - if (ST->hasFP32Denormals() && !UnsafeDiv) + + // With UnsafeDiv node will be optimized to just rcp and mul. 
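promoteUniformOpToI32 above widens sub-32-bit integer operations to i32 and truncates the result back, and promotedOpIsNSW/NUW record that add, sub and shl on values that started out 16 bits or narrower cannot wrap once performed in 32 bits. The transformation is easy to model on plain integers (a sketch, not the pass itself):

#include <cstdint>
#include <cstdio>

// Perform a 16-bit unsigned add "the promoted way": zero-extend both operands
// to 32 bits, add with no possibility of 32-bit overflow, then truncate.
static uint16_t addPromoted(uint16_t A, uint16_t B) {
  uint32_t Wide = uint32_t(A) + uint32_t(B);  // at most 0x1fffe, so no 32-bit wrap
  return uint16_t(Wide);                      // truncate back to 16 bits
}

int main() {
  // Same wrapping behaviour as a native 16-bit add, but the intermediate
  // 32-bit op never overflows, which is what lets the pass mark it nuw.
  std::printf("%u\n", addPromoted(0xFFFF, 2));  // prints 1
  return 0;
}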
+ if (ST->hasFP32Denormals() || UnsafeDiv) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); Builder.setFastMathFlags(FMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); - Function *Decl - = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); @@ -447,10 +488,15 @@ bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { } bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { - if (!TM || skipFunction(F)) + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) return false; - ST = &TM->getSubtarget<SISubtarget>(F); + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + ST = &TM.getSubtarget<SISubtarget>(F); DA = &getAnalysis<DivergenceAnalysis>(); HasUnsafeFPMath = hasUnsafeFPMath(F); @@ -467,14 +513,14 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { return MadeChange; } -INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) -INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, - "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", + false, false) char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { - return new AMDGPUCodeGenPrepare(TM); +FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { + return new AMDGPUCodeGenPrepare(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 805fb71..e32ca96 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -12,11 +12,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPUFrameLowering.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Support/MathExtras.h" using namespace llvm; AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, @@ -69,34 +64,3 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { // T1.W = stack[1].w return 1; } - -/// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - const AMDGPURegisterInfo *RI - = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo(); - - // Fill in FrameReg output argument. - FrameReg = RI->getFrameRegister(MF); - - // Start the offset at 2 so we don't overwrite work group information. - // XXX: We should only do this when the shader actually uses this - // information. - unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? 
MFI.getNumObjects() : FI; - - for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); - OffsetBytes += MFI.getObjectSize(i); - // Each register holds 4 bytes, so we must always align the offset to at - // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = alignTo(OffsetBytes, 4); - } - - if (FI != -1) - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); - - return OffsetBytes / (getStackWidth(MF) * 4); -} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 5d51351..8e187c7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -34,9 +34,6 @@ public: /// values to the stack. unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override { return false; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def new file mode 100644 index 0000000..5cb9036 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -0,0 +1,62 @@ +//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines all the static objects used by AMDGPURegisterBankInfo. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +namespace llvm { +namespace AMDGPU { + +enum PartialMappingIdx { + None = - 1, + PM_SGPR32 = 0, + PM_SGPR64 = 1, + PM_VGPR32 = 2, + PM_VGPR64 = 3 +}; + +const RegisterBankInfo::PartialMapping PartMappings[] { + // StartIdx, Length, RegBank + {0, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, + {0, 32, VGPRRegBank}, + {0, 64, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappings[] { + // SGPR 32-bit + {&PartMappings[0], 1}, + // SGPR 64-bit + {&PartMappings[1], 1}, + // VGPR 32-bit + {&PartMappings[2], 1}, + // VGPR 64-bit + {&PartMappings[3], 1} +}; + +enum ValueMappingIdx { + SGPRStartIdx = 0, + VGPRStartIdx = 2 +}; + +const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, + unsigned Size) { + assert(Size % 32 == 0); + unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx; + Idx += (Size / 32) - 1; + return &ValMappings[Idx]; +} + +} // End AMDGPU namespace. +} // End llvm namespace. 
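getValueMapping in the new AMDGPUGenRegisterBankInfo.def above indexes a flat table laid out SGPR-first, VGPR-second, with the 32-bit and 64-bit variants adjacent, so the index is a bank base plus Size/32 - 1. The same lookup over a plain string table, purely for illustration:

#include <cassert>
#include <cstdio>

enum BankID { SGPRBank, VGPRBank };

// Mirror of the table layout in AMDGPUGenRegisterBankInfo.def:
// [SGPR 32, SGPR 64, VGPR 32, VGPR 64]
static const char *const ValMappings[] = {
  "SGPR 32-bit", "SGPR 64-bit", "VGPR 32-bit", "VGPR 64-bit"
};

static const char *getValueMapping(BankID Bank, unsigned Size) {
  assert(Size % 32 == 0 && (Size == 32 || Size == 64));
  unsigned Idx = (Bank == SGPRBank) ? 0 : 2;  // bank selects the base
  Idx += (Size / 32) - 1;                     // 32-bit -> +0, 64-bit -> +1
  return ValMappings[Idx];
}

int main() {
  std::printf("%s\n", getValueMapping(VGPRBank, 64));  // prints "VGPR 64-bit"
  return 0;
}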
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5bf347e..f235313 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,15 +13,15 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" -#include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUSubtarget.h" #include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "SIISelLowering.h" +#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -67,10 +67,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const AMDGPUSubtarget *Subtarget; + AMDGPUAS AMDGPUASI; public: explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel) {} + : SelectionDAGISel(TM, OptLevel){ + AMDGPUASI = AMDGPU::getAMDGPUAS(TM); + } ~AMDGPUDAGToDAGISel() override = default; bool runOnMachineFunction(MachineFunction &MF) override; @@ -79,7 +82,8 @@ public: void PostprocessISelDAG() override; private: - SDValue foldFrameIndex(SDValue N) const; + std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; + bool isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -112,8 +116,13 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffen(SDNode *Root, + SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffset(SDNode *Root, + SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -129,8 +138,10 @@ private: bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, SDValue &ImmOffset, SDValue &VOffset) const; - bool SelectFlat(SDValue Addr, SDValue &VAddr, - SDValue &SLC, SDValue &TFE) const; + bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; + bool SelectFlatOffset(SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -143,20 +154,28 @@ private: bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; + + bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue 
&SrcMods, SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3OMods(SDValue In, SDValue &Src, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + void SelectADD_SUB_I64(SDNode *N); + void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -187,6 +206,17 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { + if (TM.Options.NoNaNsFPMath) + return true; + + // TODO: Move into isKnownNeverNaN + if (N->getFlags().isDefined()) + return N->getFlags().hasNoNaNs(); + + return CurDAG->isKnownNeverNaN(N); +} + bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { const SIInstrInfo *TII = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo(); @@ -250,7 +280,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS) return N; const SITargetLowering& Lowering = @@ -290,6 +320,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { llvm_unreachable("invalid vector size"); } +static bool getConstantValue(SDValue N, uint32_t &Out) { + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + Out = C->getAPIntValue().getZExtValue(); + return true; + } + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { + Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); + return true; + } + + return false; +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -319,6 +363,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } + case ISD::UADDO: + case ISD::USUBO: { + SelectUADDO_USUBO(N); + return; + } case AMDGPUISD::FMUL_W_CHAIN: { SelectFMUL_W_CHAIN(N); return; @@ -336,7 +385,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); + + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + if (Opc == ISD::BUILD_VECTOR) { + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + uint32_t K = LHSVal | (RHSVal << 16); + CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, + CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); + return; + } + } + + break; + } + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { @@ -502,7 +568,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); - 
Lowering.legalizeTargetIndependentNode(N, *CurDAG); + N = Lowering.legalizeTargetIndependentNode(N, *CurDAG); break; } case ISD::AND: @@ -531,9 +597,9 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { if (!N->readMem()) return false; if (CbId == -1) - return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; - return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; } bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { @@ -689,6 +755,17 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { CurDAG->RemoveDeadNode(N); } +void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { + // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned + // carry out despite the _i32 name. These were renamed in VI to _U32. + // FIXME: We should probably rename the opcodes here. + unsigned Opc = N->getOpcode() == ISD::UADDO ? + AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), + { N->getOperand(0), N->getOperand(1) }); +} + void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { SDLoc SL(N); // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod @@ -881,8 +958,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } +static bool isLegalMUBUFImmOffset(unsigned Imm) { + return isUInt<12>(Imm); +} + static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); + return isLegalMUBUFImmOffset(Imm->getZExtValue()); } bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, @@ -998,43 +1079,111 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); } -SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { - if (auto FI = dyn_cast<FrameIndexSDNode>(N)) - return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - return N; +static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { + auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); + return PSV && PSV->isStack(); +} + +std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { + const MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (auto FI = dyn_cast<FrameIndexSDNode>(N)) { + SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), + FI->getValueType(0)); + + // If we can resolve this to a frame index access, this is relative to the + // frame pointer SGPR. + return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(), + MVT::i32)); + } + + // If we don't know this private access is a local stack object, it needs to + // be relative to the entry point's scratch wave offset register. 
+ return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(), + MVT::i32)); } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, + SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); - // (add n0, c1) + if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned Imm = CAddr->getZExtValue(); + assert(!isLegalMUBUFImmOffset(Imm) && + "should have been selected by other pattern"); + + SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, HighBits); + VAddr = SDValue(MovHighBits, 0); + + // In a call sequence, stores to the argument stack area are relative to the + // stack pointer. + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? + Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); + + SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + return true; + } + if (CurDAG->isBaseWithConstantOffset(Addr)) { + // (add n0, c1) + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); // Offsets in vaddr must be positive. ConstantSDNode *C1 = cast<ConstantSDNode>(N1); if (isLegalMUBUFImmOffset(C1)) { - VAddr = foldFrameIndex(N0); + std::tie(VAddr, SOffset) = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } // (node) - VAddr = foldFrameIndex(Addr); + std::tie(VAddr, SOffset) = foldFrameIndex(Addr); ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, + SDValue Addr, + SDValue &SRsrc, + SDValue &SOffset, + SDValue &Offset) const { + ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); + if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + return false; + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? + Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); + + // FIXME: Get from MachinePointerInfo? We should only be using the frame + // offset if we know this is in a call sequence. 
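// A standalone, self-contained sketch (plain C++; the helper name is hypothetical
// and this is not the LLVM API above) of the address split used by
// SelectMUBUFScratchOffen when the scratch address is a constant that does not
// fit the 12-bit MUBUF immediate: the 4096-aligned high part is materialized in
// a VGPR via V_MOV_B32 and only the low 12 bits are encoded as the immediate.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool fitsMUBUFImm(uint32_t Imm) { return Imm < (1u << 12); }

int main() {
  uint32_t Addr = 0x12345;              // hypothetical constant scratch address
  assert(!fitsMUBUFImm(Addr));          // too big for the immediate field alone
  uint32_t HighBits  = Addr & ~4095u;   // goes into the VGPR (vaddr)
  uint32_t ImmOffset = Addr &  4095u;   // encoded in the instruction
  assert(fitsMUBUFImm(ImmOffset));
  assert(HighBits + ImmOffset == Addr); // the split is lossless
  std::printf("vaddr=0x%x imm=0x%x\n", HighBits, ImmOffset);
  return 0;
}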
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); + + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, @@ -1167,23 +1316,35 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, return true; } -bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr, - SDValue &VAddr, - SDValue &SLC, - SDValue &TFE) const { +bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + int64_t OffsetVal = 0; + + if (Subtarget->hasFlatInstOffsets() && + CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue(); + if (isUInt<12>(COffsetVal)) { + Addr = N0; + OffsetVal = COffsetVal; + } + } + VAddr = Addr; - TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; } -/// -/// \param EncodedOffset This is the immediate value that will be encoded -/// directly into the instruction. On SI/CI the \p EncodedOffset -/// will be in units of dwords and on VI+ it will be units of bytes. -static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, - int64_t EncodedOffset) { - return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? - isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); +bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + return SelectFlatOffset(Addr, VAddr, Offset, SLC); } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -1197,10 +1358,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDLoc SL(ByteOffsetNode); AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); int64_t ByteOffset = C->getSExtValue(); - int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? 
- ByteOffset >> 2 : ByteOffset; + int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); - if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) { Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); Imm = true; return true; @@ -1481,7 +1641,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MemSDNode *Mem = cast<MemSDNode>(N); unsigned AS = Mem->getAddressSpace(); - if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (AS == AMDGPUASI.FLAT_ADDRESS) { SelectCode(N); return; } @@ -1545,7 +1705,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1559,42 +1718,29 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; } -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - bool Res = SelectVOP3Mods(In, Src, SrcMods); - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + SelectVOP3Mods(In, Src, SrcMods); + return isNoNanSrc(Src); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { + if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) + return false; + + Src = In; + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); - - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && - cast<ConstantSDNode>(Clamp)->isNullValue() && - cast<ConstantSDNode>(Omod)->isNullValue(); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1607,6 +1753,117 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, + SDValue &Clamp, SDValue &Omod) const { + Src = In; + + SDLoc DL(In); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); + + return true; +} + +static SDValue stripBitcast(SDValue Val) { + return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; +} + +// Figure out if this is really an extract of the high 16-bits of a dword. 
+static bool isExtractHiElt(SDValue In, SDValue &Out) { + In = stripBitcast(In); + if (In.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue Srl = In.getOperand(0); + if (Srl.getOpcode() == ISD::SRL) { + if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { + if (ShiftAmt->getZExtValue() == 16) { + Out = stripBitcast(Srl.getOperand(0)); + return true; + } + } + } + + return false; +} + +// Look through operations that obscure just looking at the low 16-bits of the +// same register. +static SDValue stripExtractLoElt(SDValue In) { + if (In.getOpcode() == ISD::TRUNCATE) { + SDValue Src = In.getOperand(0); + if (Src.getValueType().getSizeInBits() == 32) + return stripBitcast(Src); + } + + return In; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::BUILD_VECTOR) { + unsigned VecMods = Mods; + + SDValue Lo = stripBitcast(Src.getOperand(0)); + SDValue Hi = stripBitcast(Src.getOperand(1)); + + if (Lo.getOpcode() == ISD::FNEG) { + Lo = stripBitcast(Lo.getOperand(0)); + Mods ^= SISrcMods::NEG; + } + + if (Hi.getOpcode() == ISD::FNEG) { + Hi = stripBitcast(Hi.getOperand(0)); + Mods ^= SISrcMods::NEG_HI; + } + + if (isExtractHiElt(Lo, Lo)) + Mods |= SISrcMods::OP_SEL_0; + + if (isExtractHiElt(Hi, Hi)) + Mods |= SISrcMods::OP_SEL_1; + + Lo = stripExtractLoElt(Lo); + Hi = stripExtractLoElt(Hi); + + if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { + // Really a scalar input. Just select from the low half of the register to + // avoid packing. + + Src = Lo; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + Mods = VecMods; + } + + // Packed instructions do not have abs modifiers. 
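// A standalone illustration (plain C++, not SelectionDAG nodes) of the bit
// layout the code above works with: a v2i16/v2f16 value lives in one 32-bit
// register as Lo | (Hi << 16), which is also how the BUILD_VECTOR constant case
// earlier in Select() folds two constants into a single S_MOV_B32, and
// isExtractHiElt/stripExtractLoElt recognize the shift/truncate forms that read
// the two halves back.
#include <cassert>
#include <cstdint>

int main() {
  uint16_t Lo = 0x3C00, Hi = 0xC000;              // example packed f16 bit patterns
  uint32_t Packed = uint32_t(Lo) | (uint32_t(Hi) << 16);

  uint16_t LoAgain = uint16_t(Packed);            // what stripExtractLoElt looks through
  uint16_t HiAgain = uint16_t(Packed >> 16);      // the (trunc (srl x, 16)) pattern
  assert(LoAgain == Lo && HiAgain == Hi);

  // Matching the high-half pattern lets the op_sel modifier pick the high half
  // instead of emitting an explicit shift in the selected code.
  return 0;
}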
+ Mods |= SISrcMods::OP_SEL_1; + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp and op_sel + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3PMods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 54caa2c..258b173 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,11 +15,13 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "R600MachineFunctionInfo.h" +#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" @@ -28,7 +30,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" -#include "SIInstrInfo.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, @@ -43,6 +45,76 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, return true; } +static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, + const TargetRegisterClass *RC, + unsigned NumRegs) { + ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs); + unsigned RegResult = State.AllocateReg(RegList); + if (RegResult == AMDGPU::NoRegister) + return false; + + State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); + return true; +} + +static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + switch (LocVT.SimpleTy) { + case MVT::i64: + case MVT::f64: + case MVT::v2i32: + case MVT::v2f32: { + // Up to SGPR0-SGPR39 + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::SGPR_64RegClass, 20); + } + default: + return false; + } +} + +// Allocate up to VGPR31. +// +// TODO: Since there are no VGPR alignent requirements would it be better to +// split into individual scalar registers? 
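// A standalone sketch (plain C++; illustration only) of where the register
// counts used for these calling-convention helpers come from. As the TODO
// above notes, VGPR tuples have no alignment requirement, so a tuple of Size
// registers that must stay within v0..v31 can start at any of 32 - Size + 1
// registers, which gives the 31/29/25/17 limits passed to allocateCCRegs
// below; the 64-bit SGPR case above instead uses the 20 aligned pairs
// s[0:1]..s[38:39].
#include <cstdio>

static unsigned vgprTuplesWithinCap(unsigned Size, unsigned NumVGPRs = 32) {
  return NumVGPRs - Size + 1;   // legal start registers: v0 .. v(NumVGPRs - Size)
}

int main() {
  std::printf("64-bit  tuples: %u\n", vgprTuplesWithinCap(2));   // 31
  std::printf("128-bit tuples: %u\n", vgprTuplesWithinCap(4));   // 29
  std::printf("256-bit tuples: %u\n", vgprTuplesWithinCap(8));   // 25
  std::printf("512-bit tuples: %u\n", vgprTuplesWithinCap(16));  // 17
  std::printf("aligned SGPR pairs up to s39: %u\n", 40u / 2);    // 20
  return 0;
}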
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + switch (LocVT.SimpleTy) { + case MVT::i64: + case MVT::f64: + case MVT::v2i32: + case MVT::v2f32: { + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::VReg_64RegClass, 31); + } + case MVT::v4i32: + case MVT::v4f32: + case MVT::v2i64: + case MVT::v2f64: { + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::VReg_128RegClass, 29); + } + case MVT::v8i32: + case MVT::v8f32: { + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::VReg_256RegClass, 25); + + } + case MVT::v16i32: + case MVT::v16f32: { + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::VReg_512RegClass, 17); + + } + default: + return false; + } +} + #include "AMDGPUGenCallingConv.inc" // Find a larger type to do a load / store of a vector with. @@ -55,9 +127,33 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } +bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) +{ + assert(Op.getOpcode() == ISD::OR); + + SDValue N0 = Op->getOperand(0); + SDValue N1 = Op->getOperand(1); + EVT VT = N0.getValueType(); + + if (VT.isInteger() && !VT.isVector()) { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N0, LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N1, RHSKnown); + + if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) + return true; + } + } + + return false; +} + AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + AMDGPUASI = AMDGPU::getAMDGPUAS(TM); // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::LOAD, MVT::f32, Promote); @@ -211,10 +307,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // This is totally unsupported, just custom lower to produce an error. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - // Library functions. These default to Expand, but we have instructions // for them. setOperationAction(ISD::FCEIL, MVT::f32, Legal); @@ -270,6 +362,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -460,10 +553,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // N > 4 stores on the same chain. GatherAllAliasesMaxDepth = 16; - // FIXME: Need to really handle these. - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; + // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry + // about these during lowering. 
+ MaxStoresPerMemcpy = 0xffffffff; + MaxStoresPerMemmove = 0xffffffff; + MaxStoresPerMemset = 0xffffffff; setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::SHL); @@ -478,12 +572,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); + setTargetDAGCombine(ISD::FABS); + setTargetDAGCombine(ISD::AssertZext); + setTargetDAGCombine(ISD::AssertSext); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +LLVM_READNONE static bool fnegFoldsIntoOp(unsigned Opc) { switch (Opc) { case ISD::FADD: @@ -491,17 +589,83 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FMUL: case ISD::FMA: case ISD::FMAD: + case ISD::FMINNUM: + case ISD::FMAXNUM: case ISD::FSIN: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::SIN_HW: case AMDGPUISD::FMUL_LEGACY: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: return true; default: return false; } } +/// \p returns true if the operation will definitely need to use a 64-bit +/// encoding, and thus will use a VOP3 encoding regardless of the source +/// modifiers. +LLVM_READONLY +static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { + return N->getNumOperands() > 2 || VT == MVT::f64; +} + +// Most FP instructions support source modifiers, but this could be refined +// slightly. +LLVM_READONLY +static bool hasSourceMods(const SDNode *N) { + if (isa<MemSDNode>(N)) + return false; + + switch (N->getOpcode()) { + case ISD::CopyToReg: + case ISD::SELECT: + case ISD::FDIV: + case ISD::FREM: + case ISD::INLINEASM: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + case AMDGPUISD::DIV_SCALE: + + // TODO: Should really be looking at the users of the bitcast. These are + // problematic because bitcasts are used to legalize all stores to integer + // types. + case ISD::BITCAST: + return false; + default: + return true; + } +} + +bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold) { + // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus + // it is truly free to use a source modifier in all cases. If there are + // multiple users but for each one will necessitate using VOP3, there will be + // a code size increase. Try to avoid increasing code size unless we know it + // will save on the instruction count. + unsigned NumMayIncreaseSize = 0; + MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); + + // XXX - Should this limit number of uses to check? + for (const SDNode *U : N->uses()) { + if (!hasSourceMods(U)) + return false; + + if (!opMustUseVOP3Encoding(U, VT)) { + if (++NumMayIncreaseSize > CostThreshold) + return false; + } + } + + return true; +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -580,12 +744,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && - VT == MVT::f16); + + // Packed operations do not have a fabs modifier. 
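// A standalone bit-level illustration (plain C++; the half values are just
// example bit patterns) of why the packed case is treated differently here:
// negating a packed v2f16 pair only flips the two sign bits, so it maps onto a
// single XOR or onto the packed neg/neg_hi source modifiers, whereas |x| must
// clear bit 15 of each half and, as noted above, there is no packed fabs
// modifier to absorb that.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Packed = 0xC4003C00u;            // halves: 0x3C00 (1.0) and 0xC400 (-4.0)

  uint32_t Negated = Packed ^ 0x80008000u;  // fneg of both halves
  assert(Negated == 0x4400BC00u);           // -1.0 and +4.0

  uint32_t Abs = Packed & 0x7FFF7FFFu;      // fabs of both halves
  assert(Abs == 0x44003C00u);               // 1.0 and 4.0
  return 0;
}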
+ return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - return isFAbsFree(VT); + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16) || + (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -667,6 +836,46 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // TargetLowering Callbacks //===---------------------------------------------------------------------===// +CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return CC_AMDGPU; + case CallingConv::C: + case CallingConv::Fast: + return CC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } +} + +CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return RetCC_SI_Shader; + case CallingConv::C: + case CallingConv::Fast: + return RetCC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } +} + /// The SelectionDAGBuilder will automatically promote function arguments /// with illegal types. However, this does not work for the AMDGPU targets /// since the function arguments are stored in memory as these illegal types. @@ -676,7 +885,7 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting /// input values across multiple registers. Each item in the Ins array -/// represents a single value that will be stored in regsters. Ins[x].VT is +/// represents a single value that will be stored in registers. Ins[x].VT is /// the value type of the value that will be stored in the register, so /// whatever SDNode we lower the argument to needs to be this type. 
/// @@ -764,23 +973,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, } } -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const { - State.AnalyzeFormalArguments(Ins, CC_AMDGPU); -} - -void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, - const SmallVectorImpl<ISD::OutputArg> &Outs) const { - - State.AnalyzeReturn(Outs, RetCC_SI); -} - -SDValue -AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SDLoc &DL, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { + // FIXME: Fails for r600 tests + //assert(!isVarArg && Outs.empty() && OutVals.empty() && + // "wave terminate should not have return values"); return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); } @@ -788,6 +989,17 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Target specific lowering //===---------------------------------------------------------------------===// +/// Selects the correct CCAssignFn for a given CallingConvention value. +CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) { + return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); +} + +CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); +} + SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SDValue Callee = CLI.Callee; @@ -829,14 +1041,13 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: - Op->dump(&DAG); + Op->print(errs(), &DAG); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); case ISD::FREM: return LowerFREM(Op, DAG); @@ -892,19 +1103,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); - switch (G->getAddressSpace()) { - case AMDGPUAS::LOCAL_ADDRESS: { + if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); // TODO: We could emit code to handle the initialization somewhere. 
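// A conceptual sketch (plain C++; allocateLDS is a hypothetical stand-in for
// the allocateLDSGlobal call above, whose implementation is not part of this
// diff) of why a local-address-space global can be lowered to a plain
// constant: each LDS global just receives an aligned byte offset from a
// per-kernel bump allocator, and that offset is what replaces the
// GlobalAddress node.
#include <cstdio>

static unsigned allocateLDS(unsigned &LDSSize, unsigned Size, unsigned Align) {
  LDSSize = (LDSSize + Align - 1) & ~(Align - 1);  // align the running offset up
  unsigned Offset = LDSSize;
  LDSSize += Size;
  return Offset;
}

int main() {
  unsigned LDSSize = 0;
  std::printf("a (4 bytes,  align 4):  offset %u\n", allocateLDS(LDSSize, 4, 4));   // 0
  std::printf("b (16 bytes, align 16): offset %u\n", allocateLDS(LDSSize, 16, 16)); // 16
  std::printf("c (4 bytes,  align 4):  offset %u\n", allocateLDS(LDSSize, 4, 4));   // 32
  return 0;
}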
- if (hasDefinedInitializer(GV)) - break; - - unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); - return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); - } + if (!hasDefinedInitializer(GV)) { + unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); + return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); + } } const Function &Fn = *DAG.getMachineFunction().getFunction(); @@ -936,41 +1144,12 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } -SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - } -} - /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, +SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return SDValue(); - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); @@ -1228,7 +1407,10 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); + unsigned OpCode = Subtarget->hasFP32Denormals() ? + (unsigned)AMDGPUISD::FMAD_FTZ : + (unsigned)ISD::FMAD; + SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); @@ -1662,32 +1844,37 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con } // XXX - May require not supporting f32 denormals? -SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + +// Don't handle v2f16. The extra instructions to scalarize and repack around the +// compare and vselect end up producing worse code than scalarizing the whole +// operation. +SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue X = Op.getOperand(0); + EVT VT = Op.getValueType(); - SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); // TODO: Should this propagate fast-math-flags? 
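// A plain-C++ restatement (illustration only, scalar floats rather than DAG
// nodes) of the expansion LowerFROUND32_16 builds here: round-half-away-from-
// zero computed as trunc(x) plus copysign(1, x) whenever |x - trunc(x)| >= 0.5.
#include <cassert>
#include <cmath>

static float froundExpansion(float X) {
  float T    = std::trunc(X);
  float Diff = X - T;
  float Sel  = (std::fabs(Diff) >= 0.5f) ? std::copysign(1.0f, X) : 0.0f;
  return T + Sel;
}

int main() {
  assert(froundExpansion(2.5f)   ==  3.0f);   // halfway cases round away from zero
  assert(froundExpansion(-2.5f)  == -3.0f);
  assert(froundExpansion(1.25f)  ==  1.0f);
  assert(froundExpansion(-0.75f) == -1.0f);
  return 0;
}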
- SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); - SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + const SDValue One = DAG.getConstantFP(1.0, SL, VT); + const SDValue Half = DAG.getConstantFP(0.5, SL, VT); - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); - SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); + return DAG.getNode(ISD::FADD, SL, VT, T, Sel); } SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { @@ -1750,8 +1937,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT == MVT::f32) - return LowerFROUND32(Op, DAG); + if (VT == MVT::f32 || VT == MVT::f16) + return LowerFROUND32_16(Op, DAG); if (VT == MVT::f64) return LowerFROUND64(Op, DAG); @@ -2030,15 +2217,19 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, } SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue N0 = Op.getOperand(0); + + // Convert to target node to get known bits + if (N0.getValueType() == MVT::f32) + return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); if (getTargetMachine().Options.UnsafeFPMath) { // There is a generic expand for FP_TO_FP16 with unsafe fast math. return SDValue(); } - SDLoc DL(Op); - SDValue N0 = Op.getOperand(0); - assert (N0.getSimpleValueType() == MVT::f64); + assert(N0.getSimpleValueType() == MVT::f64); // f64 -> f16 conversion using round-to-nearest-even rounding mode. 
const unsigned ExpMask = 0x7ff; @@ -2198,11 +2389,11 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, //===----------------------------------------------------------------------===// static bool isU24(SDValue Op, SelectionDAG &DAG) { - APInt KnownZero, KnownOne; + KnownBits Known; EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); - return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; + return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { @@ -2220,12 +2411,13 @@ static bool simplifyI24(SDNode *Node24, unsigned OpIdx, SelectionDAG &DAG = DCI.DAG; SDValue Op = Node24->getOperand(OpIdx); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) return true; return false; @@ -2379,6 +2571,53 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } +SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + +// FIXME: This should go in generic DAG combiner with an isTruncateFree check, +// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU +// issues. +SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + // (vt2 (assertzext (truncate vt0:x), vt1)) -> + // (vt2 (truncate (assertzext vt0:x, vt1))) + if (N0.getOpcode() == ISD::TRUNCATE) { + SDValue N1 = N->getOperand(1); + EVT ExtVT = cast<VTSDNode>(N1)->getVT(); + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.bitsGE(ExtVT)) { + SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); + return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); + } + } + + return SDValue(); +} /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. 
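// A plain-C++ restatement (illustration only) of the constant folding done by
// performClampCombine above: the AMDGPU CLAMP node pins its input to
// [0.0, 1.0], and with DX10 clamp semantics a NaN input also folds to 0.0.
#include <cassert>
#include <cmath>

static float clampFold(float F, bool DX10Clamp) {
  if (F < 0.0f || (std::isnan(F) && DX10Clamp))
    return 0.0f;
  if (F > 1.0f)
    return 1.0f;
  return F;   // already in range (or an unordered value without DX10 clamp)
}

int main() {
  assert(clampFold(-2.0f, true) == 0.0f);
  assert(clampFold( 3.0f, true) == 1.0f);
  assert(clampFold(0.25f, true) == 0.25f);
  assert(clampFold(NAN,   true) == 0.0f);
  return 0;
}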
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -2406,7 +2645,57 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (N->getValueType(0) != MVT::i64) + EVT VT = N->getValueType(0); + + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHS) + return SDValue(); + + SDValue LHS = N->getOperand(0); + unsigned RHSVal = RHS->getZExtValue(); + if (!RHSVal) + return LHS; + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + + switch (LHS->getOpcode()) { + default: + break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: { + // shl (ext x) => zext (shl x), if shift does not overflow int + if (VT != MVT::i64) + break; + KnownBits Known; + SDValue X = LHS->getOperand(0); + DAG.computeKnownBits(X, Known); + unsigned LZ = Known.countMinLeadingZeros(); + if (LZ < RHSVal) + break; + EVT XVT = X.getValueType(); + SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); + return DAG.getZExtOrTrunc(Shl, SL, VT); + } + case ISD::OR: + if (!isOrEquivalentToAdd(DAG, LHS)) + break; + LLVM_FALLTHROUGH; + case ISD::ADD: { + // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) + if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), + SDValue(RHS, 0)); + SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, + SDLoc(C2), VT); + return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); + } + break; + } + } + + if (VT != MVT::i64) return SDValue(); // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) @@ -2414,19 +2703,9 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, // On some subtargets, 64-bit shift is a quarter rate instruction. In the // common case, splitting this into a move and a 32-bit shift is faster and // the same code size. - const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!RHS) - return SDValue(); - - unsigned RHSVal = RHS->getZExtValue(); if (RHSVal < 32) return SDValue(); - SDValue LHS = N->getOperand(0); - - SDLoc SL(N); - SelectionDAG &DAG = DCI.DAG; - SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); @@ -2821,20 +3100,41 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); } - } - if (VT == MVT::f32 && Cond.hasOneUse()) { - SDValue MinMax - = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); - // Revisit this node so we can catch min3/max3/med3 patterns. - //DCI.AddToWorklist(MinMax.getNode()); - return MinMax; + if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { + SDValue MinMax + = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + // Revisit this node so we can catch min3/max3/med3 patterns. + //DCI.AddToWorklist(MinMax.getNode()); + return MinMax; + } } // There's no reason to not do this if the condition has other uses. 
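// A quick numeric check (plain C++; illustration only) of the reassociation
// used by performShlCombine above: a shift distributes over an add of a
// constant, shl (add x, c2), c1 == add (shl x, c1), (c2 << c1), and the same
// holds for OR once isOrEquivalentToAdd has proven the operands share no set
// bits, so the OR cannot produce carries.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x1234, C2 = 0x40, C1 = 3;
  assert(((X + C2) << C1) == ((X << C1) + (C2 << C1)));

  uint32_t Base = 0xAB00, Off = 0xCD;      // no common set bits
  assert((Base & Off) == 0);
  assert((Base | Off) == Base + Off);      // OR behaves like ADD here
  assert(((Base | Off) << C1) == ((Base << C1) | (Off << C1)));
  return 0;
}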
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +static bool isConstantFPZero(SDValue N) { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) + return C->isZero() && !C->isNegative(); + return false; +} + +static unsigned inverseMinMax(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return ISD::FMINNUM; + case ISD::FMINNUM: + return ISD::FMAXNUM; + case AMDGPUISD::FMAX_LEGACY: + return AMDGPUISD::FMIN_LEGACY; + case AMDGPUISD::FMIN_LEGACY: + return AMDGPUISD::FMAX_LEGACY; + default: + llvm_unreachable("invalid min/max opcode"); + } +} + SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2847,10 +3147,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, // the other uses cannot, give up. This both prevents unprofitable // transformations and infinite loops: we won't repeatedly try to fold around // a negate that has no 'good' form. - // - // TODO: Check users can fold - if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) - return SDValue(); + if (N0.hasOneUse()) { + // This may be able to fold into the source, but at a code size cost. Don't + // fold if the fold into the user is free. + if (allUsesHaveSourceMods(N, 0)) + return SDValue(); + } else { + if (fnegFoldsIntoOp(Opc) && + (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) + return SDValue(); + } SDLoc SL(N); switch (Opc) { @@ -2872,7 +3178,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, else RHS = RHS.getOperand(0); - SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -2891,7 +3197,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, else RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -2923,10 +3229,40 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; } + case ISD::FMAXNUM: + case ISD::FMINNUM: + case AMDGPUISD::FMAX_LEGACY: + case AMDGPUISD::FMIN_LEGACY: { + // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) + // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) + // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) + // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) + + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + // 0 doesn't have a negated inline immediate. + // TODO: Shouldn't fold 1/2pi either, and should be generalized to other + // operations. + if (isConstantFPZero(RHS)) + return SDValue(); + + SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + unsigned Opposite = inverseMinMax(Opc); + + SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } case ISD::FP_EXTEND: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: // XXX - Should fround be handled? 
+ case ISD::FSIN: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: - case ISD::FSIN: case AMDGPUISD::SIN_HW: { SDValue CvtSrc = N0.getOperand(0); if (CvtSrc.getOpcode() == ISD::FNEG) { @@ -2941,7 +3277,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, // (fneg (fp_extend x)) -> (fp_extend (fneg x)) // (fneg (rcp x)) -> (rcp (fneg x)) SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); - return DAG.getNode(Opc, SL, VT, Neg); + return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); } case ISD::FP_ROUND: { SDValue CvtSrc = N0.getOperand(0); @@ -2959,6 +3295,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); } + case ISD::FP16_TO_FP: { + // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal + // f16, but legalization of f16 fneg ends up pulling it out of the source. + // Put the fneg back as a legal source operation that can be matched later. + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) + SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, + DAG.getConstant(0x8000, SL, SrcVT)); + return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + if (!N0.hasOneUse()) + return SDValue(); + + switch (N0.getOpcode()) { + case ISD::FP16_TO_FP: { + assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); + SDLoc SL(N); + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) + SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, + DAG.getConstant(0x7fff, SL, SrcVT)); + return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); + } default: return SDValue(); } @@ -3071,6 +3446,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSelectCombine(N, DCI); case ISD::FNEG: return performFNegCombine(N, DCI); + case ISD::FABS: + return performFAbsCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -3131,7 +3508,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DL); } - if ((OffsetVal + WidthVal) >= 32) { + if ((OffsetVal + WidthVal) >= 32 && + !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? 
ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); @@ -3142,13 +3520,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, OffsetVal, OffsetVal + WidthVal); - APInt KnownZero, KnownOne; + KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, - KnownZero, KnownOne, TLO)) { + if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } } @@ -3159,6 +3536,21 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); + case AMDGPUISD::RCP: { + if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { + // XXX - Should this flush denormals? + const APFloat &Val = CFP->getValueAPF(); + APFloat One(Val.getSemantics(), "1.0"); + return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); + } + + break; + } + case ISD::AssertZext: + case ISD::AssertSext: + return performAssertSZExtCombine(N, DCI); } return SDValue(); } @@ -3168,18 +3560,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, //===----------------------------------------------------------------------===// SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; + unsigned VReg; + if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); + VReg = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VReg); } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); + VReg = MRI.getLiveInVirtReg(Reg); } - return DAG.getRegister(VirtualRegister, VT); + + if (RawReg) + return DAG.getRegister(VReg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -3201,13 +3600,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AMDGPUISD::NodeType)Opcode) { case AMDGPUISD::FIRST_NUMBER: break; // AMDIL DAG nodes - NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes + NODE_NAME_CASE(IF) + NODE_NAME_CASE(ELSE) + NODE_NAME_CASE(LOOP) + NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TRAP) + NODE_NAME_CASE(RET_FLAG) + NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) @@ -3232,6 +3636,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(FMAD_FTZ) NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) @@ -3265,7 +3670,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_INPUT) NODE_NAME_CASE(SAMPLE) NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) @@ -3274,12 +3678,17 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE1) NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(FP_TO_FP16) + NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(INIT_EXEC) + NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) @@ -3288,6 +3697,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) @@ -3338,16 +3749,12 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, } void AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { + const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + Known.resetAll(); // Don't know anything. - APInt KnownZero2; - APInt KnownOne2; + KnownBits Known2; unsigned Opc = Op.getOpcode(); switch (Opc) { @@ -3355,7 +3762,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( break; case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: { - KnownZero = APInt::getHighBitsSet(32, 31); + Known.Zero = APInt::getHighBitsSet(32, 31); break; } @@ -3365,21 +3772,27 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( if (!CWidth) return; - unsigned BitWidth = 32; uint32_t Width = CWidth->getZExtValue() & 0x1f; if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + Known.Zero = APInt::getHighBitsSet(32, 32 - Width); + + break; + } + case AMDGPUISD::FP_TO_FP16: + case AMDGPUISD::FP16_ZEXT: { + unsigned BitWidth = Known.getBitWidth(); + // High bits are zero. + Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); break; } } } unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth) const { + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { switch (Op.getOpcode()) { case AMDGPUISD::BFE_I32: { ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); @@ -3403,7 +3816,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: return 31; - + case AMDGPUISD::FP_TO_FP16: + case AMDGPUISD::FP16_ZEXT: + return 16; default: return 1; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index f6adcea..d85aada 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -16,6 +16,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H +#include "AMDGPU.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/Target/TargetLowering.h" namespace llvm { @@ -32,12 +34,15 @@ private: /// compare. 
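// A standalone illustration (plain C++, not the KnownBits API) of the two
// facts recorded in computeKnownBitsForTargetNode above: an unsigned bitfield
// extract of Width bits can never set bits at or above Width, and a value
// produced by FP_TO_FP16 / FP16_ZEXT only occupies the low 16 bits, so the
// high bits are known zero in both cases.
#include <cassert>
#include <cstdint>

static uint32_t bfeU32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  return (Src >> Offset) & ((Width < 32) ? ((1u << Width) - 1u) : ~0u);
}

int main() {
  uint32_t R = bfeU32(0xDEADBEEFu, 8, 12);
  assert(R == 0xDBE);
  assert((R >> 12) == 0);            // bits 12..31 are provably zero

  uint32_t Half = 0xBC00u;           // an f16 bit pattern held in a 32-bit register
  assert((Half >> 16) == 0);         // the FP_TO_FP16 / FP16_ZEXT guarantee
  return 0;
}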
SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; +public: + static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + protected: const AMDGPUSubtarget *Subtarget; + AMDGPUAS AMDGPUASI; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. @@ -47,7 +52,7 @@ protected: SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; @@ -70,6 +75,8 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -85,6 +92,7 @@ protected: SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -111,24 +119,22 @@ protected: SmallVectorImpl<SDValue> &Results) const; void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; - void AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const; - void AnalyzeReturn(CCState &State, - const SmallVectorImpl<ISD::OutputArg> &Outs) const; - public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); bool mayIgnoreSignedZero(SDValue Op) const { - if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only + if (getTargetMachine().Options.NoSignedZerosFPMath) return true; - if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op)) - return BO->Flags.hasNoSignedZeros(); + const auto Flags = Op.getNode()->getFlags(); + if (Flags.isDefined()) + return Flags.hasNoSignedZeros(); return false; } + static bool allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; @@ -158,6 +164,9 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); + static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, @@ -174,7 +183,7 @@ public: SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; - SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, + SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, 
SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; @@ -196,21 +205,37 @@ public: /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth = 0) const override; /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise + /// a copy from the register. + SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg = false) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode())); + } + + // Returns the raw live in register rather than a copy from it. + SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); + } enum ImplicitParameter { FIRST_IMPLICIT, @@ -222,6 +247,14 @@ public: /// type of implicit parameter. uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; + + AMDGPUAS getAMDGPUAS() const { + return AMDGPUASI; + } + + MVT getFenceOperandTy(const DataLayout &DL) const override { + return MVT::i32; + } }; namespace AMDGPUISD { @@ -229,15 +262,35 @@ namespace AMDGPUISD { enum NodeType : unsigned { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication BRANCH_COND, // End AMDIL ISD Opcodes + + // Function call. + CALL, + TRAP, + + // Masked control flow nodes. + IF, + ELSE, + LOOP, + + // A uniform kernel return that terminates the wavefront. ENDPGM, - RETURN, + + // Return to a shader part's epilog code. + RETURN_TO_EPILOG, + + // Return with values from a non-entry function. + RET_FLAG, + DWORDADDR, FRACT, + + /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output + /// modifier behavior with dx10_enable. CLAMP, + // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. SETCC, @@ -265,6 +318,9 @@ enum NodeType : unsigned { DIV_SCALE, DIV_FMAS, DIV_FIXUP, + // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is + // treated as an illegal operation. + FMAD_FTZ, TRIG_PREOP, // 1 ULP max error for f64 // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. @@ -301,7 +357,6 @@ enum NodeType : unsigned { CONST_ADDRESS, REGISTER_LOAD, REGISTER_STORE, - LOAD_INPUT, SAMPLE, SAMPLEB, SAMPLED, @@ -312,6 +367,18 @@ enum NodeType : unsigned { CVT_F32_UBYTE1, CVT_F32_UBYTE2, CVT_F32_UBYTE3, + + // Convert two float 32 numbers into a single register holding two packed f16 + // with round to zero. 
+ CVT_PKRTZ_F16_F32, + + // Same as the standard node, except the high bits of the resulting integer + // are known 0. + FP_TO_FP16, + + // Wrapper around fp16 results that are known to zero the high bits. + FP16_ZEXT, + /// This node is for VLIW targets and it is used to represent a vector /// that is stored in consecutive registers with the same channel. /// For example: @@ -323,6 +390,8 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, + INIT_EXEC, + INIT_EXEC_FROM_INPUT, SENDMSG, SENDMSGHALT, INTERP_MOV, @@ -335,6 +404,8 @@ enum NodeType : unsigned { STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index e4dc659..69dc529 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -30,7 +30,7 @@ using namespace llvm; void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(-1, -1), ST(ST) {} + : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {} // FIXME: This behaves strangely. If, for example, you have 32 load + stores, // the first 16 loads will be interleaved with the stores, and the next 16 will @@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, - VI = 1 + VI = 1, + SDWA = 2, + SDWA9 = 3 }; // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -86,6 +88,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { case AMDGPUSubtarget::SEA_ISLANDS: return SIEncodingFamily::SI; case AMDGPUSubtarget::VOLCANIC_ISLANDS: + case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; // FIXME: This should never be called for r600 GPUs. @@ -100,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. 
if (MCOp == -1) diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index bd8e389..41cc7d7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -16,11 +16,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H -#include "llvm/Target/TargetInstrInfo.h" +#include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER -#define GET_INSTRINFO_ENUM #include "AMDGPUGenInstrInfo.inc" namespace llvm { @@ -35,6 +35,8 @@ private: const AMDGPUSubtarget &ST; virtual void anchor(); +protected: + AMDGPUAS AMDGPUASI; public: explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index d7fa28b..bcf89bb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] >; +def AMDGPUFPPackOp : SDTypeProfile<1, 2, + [SDTCisFP<1>, SDTCisSameAs<1, 2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -42,10 +46,47 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def AMDGPUIfOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] +>; + +def AMDGPUElseOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>] +>; + +def AMDGPULoopOp : SDTypeProfile<0, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] +>; + +def AMDGPUBreakOp : SDTypeProfile<1, 1, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>] +>; + +def AMDGPUIfBreakOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] +>; + +def AMDGPUElseBreakOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] +>; + +def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // +def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; +def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; +def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; + +def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", + SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, + [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] +>; + def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -78,6 +119,11 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; +def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; + + def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. 
@@ -92,17 +138,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; - -// out = max(a, b) a and b are signed ints -def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = max(a, b) a and b are unsigned ints -def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; // out = min(a, b) a and b are floats, where a nan comparison fails. def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, @@ -147,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; +// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own +// nodes in TargetSelectionDAG.td. +def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; + +def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; + def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; @@ -194,6 +236,8 @@ def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; // Denominator, src2 = Numerator). def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; + // Look Up 2.0 / pi src0 with segment select src1[4:0] def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; @@ -265,6 +309,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", + SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; @@ -291,15 +344,16 @@ def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ - SDTCisInt<0>, // i8 en - SDTCisInt<1>, // i1 vm + SDTCisInt<0>, // i8 tgt + SDTCisInt<1>, // i8 en + // i32 or f32 src0 + SDTCisSameAs<3, 2>, // f32 src1 + SDTCisSameAs<4, 2>, // f32 src2 + SDTCisSameAs<5, 2>, // f32 src3 + SDTCisInt<6>, // i1 compr // skip done - SDTCisInt<2>, // i8 tgt - SDTCisSameAs<3, 1>, // i1 compr - SDTCisFP<4>, // f32 src0 - SDTCisSameAs<5, 4>, // f32 src1 - SDTCisSameAs<6, 4>, // f32 src2 - SDTCisSameAs<7, 4> // f32 src3 + SDTCisInt<1> // i1 vm + ]>; def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, @@ -333,5 +387,9 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; -def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone, +def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp new file mode 100644 index 0000000..e54c887 --- 
/dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -0,0 +1,425 @@ +//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstructionSelector.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-isel" + +using namespace llvm; + +AMDGPUInstructionSelector::AMDGPUInstructionSelector( + const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI) + : InstructionSelector(), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {} + +MachineOperand +AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, + unsigned SubIdx) const { + + MachineInstr *MI = MO.getParent(); + MachineBasicBlock *BB = MO.getParent()->getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (MO.isReg()) { + unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); + unsigned Reg = MO.getReg(); + BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) + .addReg(Reg, 0, ComposedSubIdx); + + return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), + MO.isKill(), MO.isDead(), MO.isUndef(), + MO.isEarlyClobber(), 0, MO.isDebug(), + MO.isInternalRead()); + } + + assert(MO.isImm()); + + APInt Imm(64, MO.getImm()); + + switch (SubIdx) { + default: + llvm_unreachable("do not know to split immediate with this sub index."); + case AMDGPU::sub0: + return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); + case AMDGPU::sub1: + return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); + } +} + +bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); + unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + if (Size != 64) + return false; + + DebugLoc DL = I.getDebugLoc(); + + MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0)); + MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) + .add(Lo1) + .add(Lo2); + + MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1)); + MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) + .add(Hi1) + .add(Hi2); + + BuildMI(*BB, &I, 
DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg()) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + + for (MachineOperand &MO : I.explicit_operands()) { + if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI); + } + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { + return selectG_ADD(I); +} + +bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + DebugLoc DL = I.getDebugLoc(); + + // FIXME: Select store instruction based on address space + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD)) + .add(I.getOperand(1)) + .add(I.getOperand(0)) + .addImm(0) // offset + .addImm(0) // glc + .addImm(0); // slc + + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. + bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (Size == 32) { + I.setDesc(TII.get(AMDGPU::S_MOV_B32)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + assert(Size == 64); + + DebugLoc DL = I.getDebugLoc(); + unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + const APInt &Imm = I.getOperand(1).getCImm()->getValue(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + // We can't call constrainSelectedInstRegOperands here, because it doesn't + // work for target independent opcodes + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); +} + +static bool isConstant(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_CONSTANT; +} + +void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, + const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { + + const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); + + assert(PtrMI); + + if (PtrMI->getOpcode() != TargetOpcode::G_GEP) + return; + + GEPInfo GEPInfo(*PtrMI); + + for (unsigned i = 1, e = 3; i < e; ++i) { + const MachineOperand &GEPOp = PtrMI->getOperand(i); + const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); + assert(OpDef); + if (isConstant(*OpDef)) { + // FIXME: Is it possible to have multiple Imm parts? Maybe if we + // are lacking other optimizations. 
+ assert(GEPInfo.Imm == 0); + GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); + continue; + } + const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); + if (OpBank->getID() == AMDGPU::SGPRRegBankID) + GEPInfo.SgprParts.push_back(GEPOp.getReg()); + else + GEPInfo.VgprParts.push_back(GEPOp.getReg()); + } + + AddrInfo.push_back(GEPInfo); + getAddrModeInfo(*PtrMI, MRI, AddrInfo); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + const Value *Ptr = MMO->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { + + if (LoadSize == 32) + return BaseOpcode; + + switch (BaseOpcode) { + case AMDGPU::S_LOAD_DWORD_IMM: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM; + } + break; + case AMDGPU::S_LOAD_DWORD_IMM_ci: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM_ci; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM_ci; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM_ci; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM_ci; + } + break; + case AMDGPU::S_LOAD_DWORD_SGPR: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_SGPR; + case 128: + return AMDGPU::S_LOAD_DWORDX4_SGPR; + case 256: + return AMDGPU::S_LOAD_DWORDX8_SGPR; + case 512: + return AMDGPU::S_LOAD_DWORDX16_SGPR; + } + break; + } + llvm_unreachable("Invalid base smrd opcode or size"); +} + +bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { + for (const GEPInfo &GEPInfo : AddrInfo) { + if (!GEPInfo.VgprParts.empty()) + return true; + } + return false; +} + +bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, + ArrayRef<GEPInfo> AddrInfo) const { + + if (!I.hasOneMemOperand()) + return false; + + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS) + return false; + + if (!isInstrUniform(I)) + return false; + + if (hasVgprParts(AddrInfo)) + return false; + + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + unsigned Opcode; + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { + + const GEPInfo &GEPInfo = AddrInfo[0]; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); + if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if 
(Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && + isUInt<32>(EncodedImm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if (isUInt<32>(GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addReg(OffsetReg) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + } + + unsigned PtrReg = I.getOperand(1).getReg(); + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(0) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); +} + + +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + DebugLoc DL = I.getDebugLoc(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned PtrReg = I.getOperand(1).getReg(); + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Opcode; + + SmallVector<GEPInfo, 4> AddrInfo; + + getAddrModeInfo(I, MRI, AddrInfo); + + if (selectSMRD(I, AddrInfo)) { + I.eraseFromParent(); + return true; + } + + switch (LoadSize) { + default: + llvm_unreachable("Load size not supported\n"); + case 32: + Opcode = AMDGPU::FLAT_LOAD_DWORD; + break; + case 64: + Opcode = AMDGPU::FLAT_LOAD_DWORDX2; + break; + } + + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(I.getOperand(0)) + .addReg(PtrReg) + .addImm(0) // offset + .addImm(0) // glc + .addImm(0); // slc + + bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::select(MachineInstr &I) const { + + if (!isPreISelGenericOpcode(I.getOpcode())) + return true; + + switch (I.getOpcode()) { + default: + break; + case TargetOpcode::G_ADD: + return selectG_ADD(I); + case TargetOpcode::G_CONSTANT: + return selectG_CONSTANT(I); + case TargetOpcode::G_GEP: + return selectG_GEP(I); + case TargetOpcode::G_LOAD: + return selectG_LOAD(I); + case TargetOpcode::G_STORE: + return selectG_STORE(I); + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h new file mode 100644 index 0000000..ef845f4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -0,0 +1,67 @@ +//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the InstructionSelector class for +/// AMDGPU. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H + +#include "AMDGPU.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" + +namespace llvm { + +class AMDGPUInstrInfo; +class AMDGPURegisterBankInfo; +class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class SIInstrInfo; +class SIRegisterInfo; +class SISubtarget; + +class AMDGPUInstructionSelector : public InstructionSelector { +public: + AMDGPUInstructionSelector(const SISubtarget &STI, + const AMDGPURegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; +private: + struct GEPInfo { + const MachineInstr &GEP; + SmallVector<unsigned, 2> SgprParts; + SmallVector<unsigned, 2> VgprParts; + int64_t Imm; + GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } + }; + + MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + bool selectG_CONSTANT(MachineInstr &I) const; + bool selectG_ADD(MachineInstr &I) const; + bool selectG_GEP(MachineInstr &I) const; + bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; + void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, + SmallVectorImpl<GEPInfo> &AddrInfo) const; + bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const; + bool selectG_LOAD(MachineInstr &I) const; + bool selectG_STORE(MachineInstr &I) const; + + const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + const AMDGPURegisterBankInfo &RBI; +protected: + AMDGPUAS AMDGPUASI; +}; + +} // End llvm namespace. +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 59cba63..4e688ab 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +def u16ImmTarget : AsmOperandClass { + let Name = "U16Imm"; + let RenderMethod = "addImmOperands"; +} + +def s16ImmTarget : AsmOperandClass { + let Name = "S16Imm"; + let RenderMethod = "addImmOperands"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand<i32> { @@ -58,6 +68,12 @@ def u32imm : Operand<i32> { def u16imm : Operand<i16> { let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = u16ImmTarget; +} + +def s16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = s16ImmTarget; } def u8imm : Operand<i8> { @@ -72,6 +88,49 @@ def u8imm : Operand<i8> { def brtarget : Operand<OtherVT>; //===----------------------------------------------------------------------===// +// Misc. 
PatFrags +//===----------------------------------------------------------------------===// + +class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + +class HasOneUseBinOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1), + (op $src0, $src1), + [{ return N->hasOneUse(); }] +>; + +class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1, node:$src2), + (op $src0, $src1, $src2), + [{ return N->hasOneUse(); }] +>; + +def trunc_oneuse : HasOneUseUnaryOp<trunc>; + +let Properties = [SDNPCommutative, SDNPAssociative] in { +def smax_oneuse : HasOneUseBinOp<smax>; +def smin_oneuse : HasOneUseBinOp<smin>; +def umax_oneuse : HasOneUseBinOp<umax>; +def umin_oneuse : HasOneUseBinOp<umin>; +def fminnum_oneuse : HasOneUseBinOp<fminnum>; +def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>; +def and_oneuse : HasOneUseBinOp<and>; +def or_oneuse : HasOneUseBinOp<or>; +def xor_oneuse : HasOneUseBinOp<xor>; +} // Properties = [SDNPCommutative, SDNPAssociative] + +def sub_oneuse : HasOneUseBinOp<sub>; + +def srl_oneuse : HasOneUseBinOp<srl>; +def shl_oneuse : HasOneUseBinOp<shl>; + +def select_oneuse : HasOneUseTernaryOp<select>; + +//===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -157,27 +216,11 @@ def COND_NULL : PatLeaf < //===----------------------------------------------------------------------===// -// Misc. PatFrags -//===----------------------------------------------------------------------===// - -class HasOneUseBinOp<SDPatternOperator op> : PatFrag< - (ops node:$src0, node:$src1), - (op $src0, $src1), - [{ return N->hasOneUse(); }] ->; - -class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< - (ops node:$src0, node:$src1, node:$src2), - (op $src0, $src1, $src2), - [{ return N->hasOneUse(); }] ->; - -//===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; }]>; class PrivateLoad <SDPatternOperator op> : PrivateMemOp < @@ -195,7 +238,7 @@ def truncstorei16_private : PrivateStore <truncstorei16>; def store_private : PrivateStore <store>; class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; // Global address space loads @@ -215,7 +258,7 @@ def global_store_atomic : GlobalStore<atomic_store>; class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; }]>; // Constant address space loads @@ -226,7 +269,7 @@ class ConstantLoad <SDPatternOperator op> : ConstantMemOp < def constant_load : ConstantLoad<load>; class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; // Local address space loads @@ -239,7 +282,7 @@ 
class LocalStore <SDPatternOperator op> : LocalMemOp < >; class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS; + return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUASI.FLAT_ADDRESS; }]>; class FlatLoad <SDPatternOperator op> : FlatMemOp < @@ -321,7 +364,7 @@ def local_store_aligned8bytes : Aligned8Bytes < class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; @@ -339,7 +382,7 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { @@ -349,7 +392,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def _64_local : PatFrag< @@ -357,7 +400,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; } @@ -367,17 +410,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> { def "" : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def _noret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def _ret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; } defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; @@ -395,22 +438,22 @@ defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; def AMDGPUatomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$value), (AMDGPUatomic_cmp_swap node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def atomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def atomic_cmp_swap_global_noret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, 
node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def atomic_cmp_swap_global_ret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; //===----------------------------------------------------------------------===// // Misc Pattern Fragments @@ -422,6 +465,7 @@ int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; +int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -452,7 +496,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < @@ -565,6 +609,12 @@ multiclass BFIPatterns <Instruction BFI_INT, >; def : Pat < + (f32 (fcopysign f32:$src0, f64:$src1)), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, + (i32 (EXTRACT_SUBREG $src1, sub1))) + >; + + def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -602,10 +652,22 @@ def IMMPopCount : SDNodeXForm<imm, [{ MVT::i32); }]>; -class BFEPattern <Instruction BFE, Instruction MOV> : Pat < - (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), - (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) ->; +multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { + def : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) + >; + + def : Pat < + (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + (UBFE $src, (i32 0), $width) + >; + + def : Pat < + (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + (SBFE $src, (i32 0), $width) + >; +} // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : Pat < @@ -618,23 +680,13 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < class IntMed3Pat<Instruction med3Inst, SDPatternOperator max, SDPatternOperator max_oneuse, - SDPatternOperator min_oneuse> : Pat< - (max (min_oneuse i32:$src0, i32:$src1), - (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + SDPatternOperator min_oneuse, + ValueType vt = i32> : Pat< + (max (min_oneuse vt:$src0, vt:$src1), + (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) >; -let Properties = [SDNPCommutative, SDNPAssociative] in { -def smax_oneuse : HasOneUseBinOp<smax>; -def smin_oneuse : HasOneUseBinOp<smin>; -def umax_oneuse : HasOneUseBinOp<umax>; -def umin_oneuse : HasOneUseBinOp<umin>; -} // Properties = [SDNPCommutative, SDNPAssociative] - -def sub_oneuse : HasOneUseBinOp<sub>; - -def select_oneuse : HasOneUseTernaryOp<select>; - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 8e3471b..86dc9bd 100644 
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -54,14 +54,7 @@ std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, ArrayRef<Type*> Tys) const { // FIXME: Re-use Intrinsic::getType machinery - switch (ID) { - case AMDGPUIntrinsic::amdgcn_fdiv_fast: { - Type *F32Ty = Type::getFloatTy(Context); - return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); - } - default: - llvm_unreachable("unhandled intrinsic"); - } + llvm_unreachable("unhandled intrinsic"); } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -97,8 +90,8 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Function *F = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); - AttributeSet AS = getAttributes(M->getContext(), - static_cast<AMDGPUIntrinsic::ID>(IntrID)); + AttributeList AS = + getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID)); F->setAttributes(AS); return F; } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ceae0b5..18c9bd9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -12,25 +12,8 @@ //===----------------------------------------------------------------------===// let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; - - // Deprecated in favor of llvm.amdgcn.sffbh - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // Deprecated in favor of separate int_amdgcn_cube* intrinsics. - def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - - // Deprecated in favor of expanded bit operations - def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Deprecated in favor of llvm.amdgcn.rsq - def int_AMDGPU_rsq : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] - >; } include "SIIntrinsics.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp new file mode 100644 index 0000000..cc56216 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -0,0 +1,88 @@ +//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetOpcodes.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPULegalizerInfo::AMDGPULegalizerInfo() { + using namespace TargetOpcode; + + const LLT S1= LLT::scalar(1); + const LLT V2S16 = LLT::vector(2, 16); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT P1 = LLT::pointer(1, 64); + const LLT P2 = LLT::pointer(2, 64); + + setAction({G_ADD, S32}, Legal); + setAction({G_AND, S32}, Legal); + + setAction({G_BITCAST, V2S16}, Legal); + setAction({G_BITCAST, 1, S32}, Legal); + + setAction({G_BITCAST, S32}, Legal); + setAction({G_BITCAST, 1, V2S16}, Legal); + + // FIXME: i1 operands to intrinsics should always be legal, but other i1 + // values may not be legal. We need to figure out how to distinguish + // between these two scenarios. + setAction({G_CONSTANT, S1}, Legal); + setAction({G_CONSTANT, S32}, Legal); + setAction({G_CONSTANT, S64}, Legal); + + setAction({G_FCONSTANT, S32}, Legal); + + setAction({G_GEP, P1}, Legal); + setAction({G_GEP, P2}, Legal); + setAction({G_GEP, 1, S64}, Legal); + + setAction({G_ICMP, S1}, Legal); + setAction({G_ICMP, 1, S32}, Legal); + + setAction({G_LOAD, P1}, Legal); + setAction({G_LOAD, P2}, Legal); + setAction({G_LOAD, S32}, Legal); + setAction({G_LOAD, 1, P1}, Legal); + setAction({G_LOAD, 1, P2}, Legal); + + setAction({G_SELECT, S32}, Legal); + setAction({G_SELECT, 1, S1}, Legal); + + setAction({G_SHL, S32}, Legal); + + setAction({G_STORE, S32}, Legal); + setAction({G_STORE, 1, P1}, Legal); + + // FIXME: When RegBankSelect inserts copies, it will only create new + // registers with scalar types. This means we can end up with + // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer + // operands. In assert builds, the instruction selector will assert + // if it sees a generic instruction which isn't legal, so we need to + // tell it that scalar types are legal for pointer operands + setAction({G_GEP, S64}, Legal); + setAction({G_LOAD, 1, S64}, Legal); + setAction({G_STORE, 1, S64}, Legal); + + computeTables(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h new file mode 100644 index 0000000..291e336 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -0,0 +1,30 @@ +//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class LLVMContext; + +/// This class provides the information for the target register banks. 
+class AMDGPULegalizerInfo : public LegalizerInfo { +public: + AMDGPULegalizerInfo(); +}; +} // End llvm namespace. +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp new file mode 100644 index 0000000..7e0e980 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -0,0 +1,170 @@ +//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +#define DEBUG_TYPE "amdgpu-lower-intrinsics" + +using namespace llvm; + +namespace { + +const unsigned MaxStaticSize = 1024; + +class AMDGPULowerIntrinsics : public ModulePass { +private: + bool makeLIDRangeMetadata(Function &F) const; + +public: + static char ID; + + AMDGPULowerIntrinsics() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + bool expandMemIntrinsicUses(Function &F); + StringRef getPassName() const override { + return "AMDGPU Lower Intrinsics"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + } +}; + +} + +char AMDGPULowerIntrinsics::ID = 0; + +char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; + +INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, + false) + +// TODO: Should refine based on estimated number of accesses (e.g. 
does it +// require splitting based on alignment) +static bool shouldExpandOperationWithSize(Value *Size) { + ConstantInt *CI = dyn_cast<ConstantInt>(Size); + return !CI || (CI->getZExtValue() > MaxStaticSize); +} + +bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { + Intrinsic::ID ID = F.getIntrinsicID(); + bool Changed = false; + + for (auto I = F.user_begin(), E = F.user_end(); I != E;) { + Instruction *Inst = cast<Instruction>(*I); + ++I; + + switch (ID) { + case Intrinsic::memcpy: { + auto *Memcpy = cast<MemCpyInst>(Inst); + if (shouldExpandOperationWithSize(Memcpy->getLength())) { + Function *ParentFunc = Memcpy->getParent()->getParent(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); + expandMemCpyAsLoop(Memcpy, TTI); + Changed = true; + Memcpy->eraseFromParent(); + } + + break; + } + case Intrinsic::memmove: { + auto *Memmove = cast<MemMoveInst>(Inst); + if (shouldExpandOperationWithSize(Memmove->getLength())) { + expandMemMoveAsLoop(Memmove); + Changed = true; + Memmove->eraseFromParent(); + } + + break; + } + case Intrinsic::memset: { + auto *Memset = cast<MemSetInst>(Inst); + if (shouldExpandOperationWithSize(Memset->getLength())) { + expandMemSetAsLoop(Memset); + Changed = true; + Memset->eraseFromParent(); + } + + break; + } + default: + break; + } + } + + return Changed; +} + +bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F); + bool Changed = false; + + for (auto *U : F.users()) { + auto *CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + Changed |= ST.makeLIDRangeMetadata(CI); + } + return Changed; +} + +bool AMDGPULowerIntrinsics::runOnModule(Module &M) { + bool Changed = false; + + for (Function &F : M) { + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + if (expandMemIntrinsicUses(F)) + Changed = true; + break; + + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::r600_read_local_size_x: + case Intrinsic::r600_read_local_size_y: + case Intrinsic::r600_read_local_size_z: + Changed |= makeLIDRangeMetadata(F); + break; + + default: + break; + } + } + + return Changed; +} + +ModulePass *llvm::createAMDGPULowerIntrinsicsPass() { + return new AMDGPULowerIntrinsics(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 7d56355..63dd0d7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -38,7 +38,6 @@ using namespace llvm; #include "AMDGPUGenMCPseudoLowering.inc" - AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st, const AsmPrinter &ap): Ctx(ctx), ST(st), AP(ap) { } @@ -126,9 +125,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + unsigned Opcode = MI->getOpcode(); - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + // FIXME: Should be able to handle this with 
emitPseudoExpansionLowering. We + // need to select it to the subtarget specific version, and there's no way to + // do that with a single pseudo source operation. + if (Opcode == AMDGPU::S_SETPC_B64_return) + Opcode = AMDGPU::S_SETPC_B64; + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " @@ -151,6 +156,28 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, return MCInstLowering.lowerOperand(MO, MCOp); } +const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { + // TargetMachine does not support llvm-style cast. Use C++-style cast. + // This is safe since TM is always of type AMDGPUTargetMachine or its + // derived class. + auto *AT = static_cast<AMDGPUTargetMachine*>(&TM); + auto *CE = dyn_cast<ConstantExpr>(CV); + + // Lower null pointers in private and local address space. + // Clang generates addrspacecast for null pointers in private and local + // address space, which needs to be lowered. + if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) { + auto Op = CE->getOperand(0); + auto SrcAddr = Op->getType()->getPointerAddressSpace(); + if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) { + auto DstAddr = CE->getType()->getPointerAddressSpace(); + return MCConstantExpr::create(AT->getNullPointerValue(DstAddr), + OutContext); + } + } + return AsmPrinter::lowerConstant(CV); +} + void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; @@ -162,7 +189,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); C.emitError("Illegal instruction detected: " + Err); - MI->dump(); + MI->print(errs()); } if (MI->isBundle()) { @@ -173,8 +200,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { ++I; } } else { - // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder - // terminator instructions and should only be printed as comments. + // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are + // placeholder terminator instructions and should only be printed as + // comments. if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) { if (isVerbose()) { SmallVector<char, 16> BBStr; @@ -190,9 +218,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - if (MI->getOpcode() == AMDGPU::SI_RETURN) { + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) - OutStreamer->emitRawComment(" return"); + OutStreamer->emitRawComment(" return to shader part epilog"); return; } @@ -202,6 +230,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { + if (isVerbose()) + OutStreamer->emitRawComment(" divergent unreachable"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp new file mode 100644 index 0000000..9a391d0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -0,0 +1,2881 @@ +//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. 
===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the machine instruction level CFG structurizer pass. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegionInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <tuple> +using namespace llvm; + +#define DEBUG_TYPE "amdgpucfgstructurizer" + +namespace { +class PHILinearizeDestIterator; + +class PHILinearize { + friend class PHILinearizeDestIterator; + +public: + typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT; + +private: + typedef DenseSet<PHISourceT> PHISourcesT; + typedef struct { + unsigned DestReg; + DebugLoc DL; + PHISourcesT Sources; + } PHIInfoElementT; + typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT; + PHIInfoT PHIInfo; + + static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); + static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef); + static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info); + static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + static void phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB); + PHIInfoElementT *findPHIInfoElement(unsigned DestReg); + PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB); + +public: + bool findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources); + void addDest(unsigned DestReg, const DebugLoc &DL); + void replaceDef(unsigned OldDestReg, unsigned NewDestReg); + void deleteDef(unsigned DestReg); + void addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + void removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB = nullptr); + bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg); + bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr); + unsigned getNumSources(unsigned DestReg); + void dump(MachineRegisterInfo *MRI); + void clear(); + + typedef PHISourcesT::iterator source_iterator; + typedef PHILinearizeDestIterator dest_iterator; + + dest_iterator dests_begin(); + dest_iterator dests_end(); + + source_iterator sources_begin(unsigned Reg); + source_iterator sources_end(unsigned Reg); +}; + +class PHILinearizeDestIterator { +private: + PHILinearize::PHIInfoT::iterator Iter; + +public: + unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } + PHILinearizeDestIterator &operator++() { + ++Iter; + return *this; + } + bool operator==(const 
PHILinearizeDestIterator &I) const { + return I.Iter == Iter; + } + bool operator!=(const PHILinearizeDestIterator &I) const { + return I.Iter != Iter; + } + + PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} +}; + +unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { + return Info->DestReg; +} + +void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info, + unsigned NewDef) { + Info->DestReg = NewDef; +} + +PHILinearize::PHISourcesT & +PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) { + return Info->Sources; +} + +void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + // Assertion ensures we don't use the same SourceMBB for the + // sources, because we cannot have different registers with + // identical predecessors, but we can have the same register for + // multiple predecessors. +#if !defined(NDEBUG) + for (auto SI : phiInfoElementGetSources(Info)) { + assert((SI.second != SourceMBB || SourceReg == SI.first)); + } +#endif + + phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB)); +} + +void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + auto &Sources = phiInfoElementGetSources(Info); + SmallVector<PHISourceT, 4> ElimiatedSources; + for (auto SI : Sources) { + if (SI.first == SourceReg && + (SI.second == nullptr || SI.second == SourceMBB)) { + ElimiatedSources.push_back(PHISourceT(SI.first, SI.second)); + } + } + + for (auto &Source : ElimiatedSources) { + Sources.erase(Source); + } +} + +PHILinearize::PHIInfoElementT * +PHILinearize::findPHIInfoElement(unsigned DestReg) { + for (auto I : PHIInfo) { + if (phiInfoElementGetDest(I) == DestReg) { + return I; + } + } + return nullptr; +} + +PHILinearize::PHIInfoElementT * +PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + for (auto I : PHIInfo) { + for (auto SI : phiInfoElementGetSources(I)) { + if (SI.first == SourceReg && + (SI.second == nullptr || SI.second == SourceMBB)) { + return I; + } + } + } + return nullptr; +} + +bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources) { + bool FoundSource = false; + for (auto I : PHIInfo) { + for (auto SI : phiInfoElementGetSources(I)) { + if (SI.second == SourceMBB) { + FoundSource = true; + Sources.push_back(SI.first); + } + } + } + return FoundSource; +} + +void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) { + assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exsists"); + PHISourcesT EmptySet; + PHIInfoElementT *NewElement = new PHIInfoElementT(); + NewElement->DestReg = DestReg; + NewElement->DL = DL; + NewElement->Sources = EmptySet; + PHIInfo.insert(NewElement); +} + +void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) { + phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg); +} + +void PHILinearize::deleteDef(unsigned DestReg) { + PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg); + PHIInfo.erase(InfoElement); + delete InfoElement; +} + +void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); +} + +void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); +} + 
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg) { + PHIInfoElementT *InfoElement = + findPHIInfoElementFromSource(SourceReg, SourceMBB); + if (InfoElement != nullptr) { + DestReg = phiInfoElementGetDest(InfoElement); + return true; + } + return false; +} + +bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) { + unsigned DestReg; + return findDest(Reg, SourceMBB, DestReg); +} + +unsigned PHILinearize::getNumSources(unsigned DestReg) { + return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size(); +} + +void PHILinearize::dump(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + dbgs() << "=PHIInfo Start=\n"; + for (auto PII : this->PHIInfo) { + PHIInfoElementT &Element = *PII; + dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI) + << " Sources: {"; + for (auto &SI : Element.Sources) { + dbgs() << PrintReg(SI.first, TRI) << "(BB#" + << SI.second->getNumber() << "),"; + } + dbgs() << "}\n"; + } + dbgs() << "=PHIInfo End=\n"; +} + +void PHILinearize::clear() { PHIInfo = PHIInfoT(); } + +PHILinearize::dest_iterator PHILinearize::dests_begin() { + return PHILinearizeDestIterator(PHIInfo.begin()); +} + +PHILinearize::dest_iterator PHILinearize::dests_end() { + return PHILinearizeDestIterator(PHIInfo.end()); +} + +PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).begin(); +} +PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).end(); +} + +class RegionMRT; +class MBBMRT; + +static unsigned getPHINumInputs(MachineInstr &PHI) { + assert(PHI.isPHI()); + return (PHI.getNumOperands() - 1) / 2; +} + +static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 2).getMBB(); +} + +static void setPhiPred(MachineInstr &PHI, unsigned Index, + MachineBasicBlock *NewPred) { + PHI.getOperand(Index * 2 + 2).setMBB(NewPred); +} + +static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 1).getReg(); +} + +static unsigned getPHIDestReg(MachineInstr &PHI) { + assert(PHI.isPHI()); + return PHI.getOperand(0).getReg(); +} + +class LinearizedRegion { +protected: + MachineBasicBlock *Entry; + // The exit block is part of the region, and is the last + // merge block before exiting the region. 
+ MachineBasicBlock *Exit; + DenseSet<unsigned> LiveOuts; + SmallPtrSet<MachineBasicBlock *, 1> MBBs; + bool HasLoop; + LinearizedRegion *Parent; + RegionMRT *RMRT; + + void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo); + + void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion); + + void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion = nullptr); + +public: + void setRegionMRT(RegionMRT *Region) { RMRT = Region; } + + RegionMRT *getRegionMRT() { return RMRT; } + + void setParent(LinearizedRegion *P) { Parent = P; } + + LinearizedRegion *getParent() { return Parent; } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr); + + void setBBSelectRegIn(unsigned Reg); + + unsigned getBBSelectRegIn(); + + void setBBSelectRegOut(unsigned Reg, bool IsLiveOut); + + unsigned getBBSelectRegOut(); + + void setHasLoop(bool Value); + + bool getHasLoop(); + + void addLiveOut(unsigned VReg); + + void removeLiveOut(unsigned Reg); + + void replaceLiveOut(unsigned OldReg, unsigned NewReg); + + void replaceRegister(unsigned Register, unsigned NewRegister, + MachineRegisterInfo *MRI, bool ReplaceInside, + bool ReplaceOutside, bool IncludeLoopPHIs); + + void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + DenseSet<unsigned> *getLiveOuts(); + + void setEntry(MachineBasicBlock *NewEntry); + + MachineBasicBlock *getEntry(); + + void setExit(MachineBasicBlock *NewExit); + + MachineBasicBlock *getExit(); + + void addMBB(MachineBasicBlock *MBB); + + void addMBBs(LinearizedRegion *InnerRegion); + + bool contains(MachineBasicBlock *MBB); + + bool isLiveOut(unsigned Reg); + + bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI); + + void removeFalseRegisterKills(MachineRegisterInfo *MRI); + + void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(); + + ~LinearizedRegion(); +}; + +class MRT { +protected: + RegionMRT *Parent; + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + +public: + unsigned getBBSelectRegIn() { return BBSelectRegIn; } + + unsigned getBBSelectRegOut() { return BBSelectRegOut; } + + void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; } + + void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; } + + virtual RegionMRT *getRegionMRT() { return nullptr; } + + virtual MBBMRT *getMBBMRT() { return nullptr; } + + bool isRegion() { return getRegionMRT() != nullptr; } + + bool isMBB() { return getMBBMRT() != nullptr; } + + bool isRoot() { return Parent == nullptr; } + + void setParent(RegionMRT *Region) { Parent = 
Region; } + + RegionMRT *getParent() { return Parent; } + + static MachineBasicBlock * + initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap<MachineRegion *, RegionMRT *> &RegionMap); + + static RegionMRT *buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, + MachineRegisterInfo *MRI); + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0; + + void dumpDepth(int depth) { + for (int i = depth; i > 0; --i) { + dbgs() << " "; + } + } + + virtual ~MRT() {} +}; + +class MBBMRT : public MRT { + MachineBasicBlock *MBB; + +public: + virtual MBBMRT *getMBBMRT() { return this; } + + MachineBasicBlock *getMBB() { return MBB; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "MBB: " << getMBB()->getNumber(); + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + } + + MBBMRT(MachineBasicBlock *BB) : MBB(BB) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } +}; + +class RegionMRT : public MRT { +protected: + MachineRegion *Region; + LinearizedRegion *LRegion; + MachineBasicBlock *Succ; + + SetVector<MRT *> Children; + +public: + virtual RegionMRT *getRegionMRT() { return this; } + + void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { + LRegion = LinearizeRegion; + } + + LinearizedRegion *getLinearizedRegion() { return LRegion; } + + MachineRegion *getMachineRegion() { return Region; } + + unsigned getInnerOutputRegister() { + return (*(Children.begin()))->getBBSelectRegOut(); + } + + void addChild(MRT *Tree) { Children.insert(Tree); } + + SetVector<MRT *> *getChildren() { return &Children; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "Region: " << (void *)Region; + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + + dumpDepth(depth); + if (getSucc()) + dbgs() << "Succ: " << getSucc()->getNumber() << "\n"; + else + dbgs() << "Succ: none \n"; + for (auto MRTI : Children) { + MRTI->dump(TRI, depth + 1); + } + } + + MRT *getEntryTree() { return Children.back(); } + + MRT *getExitTree() { return Children.front(); } + + MachineBasicBlock *getEntry() { + MRT *Tree = Children.back(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry() + : Tree->getMBBMRT()->getMBB(); + } + + MachineBasicBlock *getExit() { + MRT *Tree = Children.front(); + return (Tree->isRegion()) ? 
Tree->getRegionMRT()->getExit() + : Tree->getMBBMRT()->getMBB(); + } + + void setSucc(MachineBasicBlock *MBB) { Succ = MBB; } + + MachineBasicBlock *getSucc() { return Succ; } + + bool contains(MachineBasicBlock *MBB) { + for (auto CI : Children) { + if (CI->isMBB()) { + if (MBB == CI->getMBBMRT()->getMBB()) { + return true; + } + } else { + if (CI->getRegionMRT()->contains(MBB)) { + return true; + } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && + CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) { + return true; + } + } + } + return false; + } + + void replaceLiveOutReg(unsigned Register, unsigned NewRegister) { + LinearizedRegion *LRegion = getLinearizedRegion(); + LRegion->replaceLiveOut(Register, NewRegister); + for (auto &CI : Children) { + if (CI->isRegion()) { + CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + } + } + } + + RegionMRT(MachineRegion *MachineRegion) + : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + virtual ~RegionMRT() { + if (LRegion) { + delete LRegion; + } + + for (auto CI : Children) { + delete &(*CI); + } + } +}; + +static unsigned createBBSelectReg(const SIInstrInfo *TII, + MachineRegisterInfo *MRI) { + return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); +} + +MachineBasicBlock * +MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap<MachineRegion *, RegionMRT *> &RegionMap) { + for (auto &MFI : MF) { + MachineBasicBlock *ExitMBB = &MFI; + if (ExitMBB->succ_size() == 0) { + return ExitMBB; + } + } + llvm_unreachable("CFG has no exit block"); + return nullptr; +} + +RegionMRT *MRT::buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, MachineRegisterInfo *MRI) { + SmallPtrSet<MachineRegion *, 4> PlacedRegions; + DenseMap<MachineRegion *, RegionMRT *> RegionMap; + MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion(); + RegionMRT *Result = new RegionMRT(TopLevelRegion); + RegionMap[TopLevelRegion] = Result; + + // Insert the exit block first, we need it to be the merge node + // for the top level region. 
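+  // The walk below adds children in post order, which is what getEntryTree()
+  // (Children.back()) and getExitTree() (Children.front()) assume.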
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap); + + unsigned BBSelectRegIn = createBBSelectReg(TII, MRI); + MBBMRT *ExitMRT = new MBBMRT(Exit); + RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT); + ExitMRT->setBBSelectRegIn(BBSelectRegIn); + + for (auto MBBI : post_order(&(MF.front()))) { + MachineBasicBlock *MBB = &(*MBBI); + + // Skip Exit since we already added it + if (MBB == Exit) { + continue; + } + + DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n"); + MBBMRT *NewMBB = new MBBMRT(MBB); + MachineRegion *Region = RegionInfo->getRegionFor(MBB); + + // Ensure we have the MRT region + if (RegionMap.count(Region) == 0) { + RegionMRT *NewMRTRegion = new RegionMRT(Region); + RegionMap[Region] = NewMRTRegion; + + // Ensure all parents are in the RegionMap + MachineRegion *Parent = Region->getParent(); + while (RegionMap.count(Parent) == 0) { + RegionMRT *NewMRTParent = new RegionMRT(Parent); + NewMRTParent->addChild(NewMRTRegion); + NewMRTRegion->setParent(NewMRTParent); + RegionMap[Parent] = NewMRTParent; + NewMRTRegion = NewMRTParent; + Parent = Parent->getParent(); + } + RegionMap[Parent]->addChild(NewMRTRegion); + NewMRTRegion->setParent(RegionMap[Parent]); + } + + // Add MBB to Region MRT + RegionMap[Region]->addChild(NewMBB); + NewMBB->setParent(RegionMap[Region]); + RegionMap[Region]->setSucc(Region->getExit()); + } + return Result; +} + +void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + // If this is a source register to a PHI we are chaining, it + // must be live out. 
+ if (PHIInfo.isSource(Reg)) { + DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } else { + // If this is live out of the MBB + for (auto &UI : MRI->use_operands(Reg)) { + if (UI.getParent()->getParent() != MBB) { + DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber() + << "): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } else { + // If the use is in the same MBB we have to make sure + // it is after the def, otherwise it is live out in a loop + MachineInstr *UseInstr = UI.getParent(); + for (MachineBasicBlock::instr_iterator + MII = UseInstr->getIterator(), + MIE = UseInstr->getParent()->instr_end(); + MII != MIE; ++MII) { + if ((&(*MII)) == DefInstr) { + DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI) + << "\n"); + addLiveOut(Reg); + } + } + } + } + } + } +} + +void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + for (auto &UI : MRI->use_operands(Reg)) { + if (!Region->contains(UI.getParent()->getParent())) { + DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region + << "): " << PrintReg(Reg, TRI) << "\n"); + addLiveOut(Reg); + } + } + } +} + +void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n"); + for (auto &II : *MBB) { + for (auto &RI : II.defs()) { + storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); + } + for (auto &IRI : II.implicit_operands()) { + if (IRI.isDef()) { + storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo); + } + } + } + + // If we have a successor with a PHI, source coming from this MBB we have to + // add the register as live out + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + E = MBB->succ_end(); + SI != E; ++SI) { + for (auto &II : *(*SI)) { + if (II.isPHI()) { + MachineInstr &PHI = II; + int numPreds = getPHINumInputs(PHI); + for (int i = 0; i < numPreds; ++i) { + if (getPHIPred(PHI, i) == MBB) { + unsigned PHIReg = getPHISourceReg(PHI, i); + DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber() + << " -> BB#" << (*SI)->getNumber() + << "): " << PrintReg(PHIReg, TRI) << "\n"); + addLiveOut(PHIReg); + } + } + } + } + } + + DEBUG(dbgs() << "-Store Live Outs Endn-\n"); +} + +void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo, + RegionMRT *TopRegion) { + for (auto &II : *MBB) { + for (auto &RI : II.defs()) { + storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI, + PHIInfo); + } + for (auto &IRI : II.implicit_operands()) { + if (IRI.isDef()) { + storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI, + TRI, PHIInfo); + } + } + } +} + +void LinearizedRegion::storeLiveOuts(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo, + RegionMRT *CurrentTopRegion) { + MachineBasicBlock *Exit = Region->getSucc(); + + RegionMRT *TopRegion = + CurrentTopRegion == nullptr ? Region : CurrentTopRegion; + + // Check if exit is end of function, if so, no live outs. 
+  if (Exit == nullptr)
+    return;
+
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (CI->isMBB()) {
+      auto MBB = CI->getMBBMRT()->getMBB();
+      storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+    } else {
+      LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+      // We should only store registers that are live out of the
+      // linearized region.
+      for (auto MBBI : SubRegion->MBBs) {
+        storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+      }
+    }
+  }
+
+  if (CurrentTopRegion == nullptr) {
+    auto Succ = Region->getSucc();
+    for (auto &II : *Succ) {
+      if (II.isPHI()) {
+        MachineInstr &PHI = II;
+        int numPreds = getPHINumInputs(PHI);
+        for (int i = 0; i < numPreds; ++i) {
+          if (Region->contains(getPHIPred(PHI, i))) {
+            unsigned PHIReg = getPHISourceReg(PHI, i);
+            DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+                         << "): " << PrintReg(PHIReg, TRI) << "\n");
+            addLiveOut(PHIReg);
+          }
+        }
+      }
+    }
+  }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+  OS << "Linearized Region {";
+  bool IsFirst = true;
+  for (const auto &MBB : MBBs) {
+    if (IsFirst) {
+      IsFirst = false;
+    } else {
+      OS << " ,";
+    }
+    OS << MBB->getNumber();
+  }
+  OS << "} (" << Entry->getNumber() << ", "
+     << (Exit == nullptr ? -1 : Exit->getNumber())
+     << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+     << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+  for (auto &LI : LiveOuts) {
+    OS << PrintReg(LI, TRI) << " ";
+  }
+  OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+  return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+  return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+  if (isLiveOut(Reg))
+    LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+  if (isLiveOut(OldReg)) {
+    removeLiveOut(OldReg);
+    addLiveOut(NewReg);
+  }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+                                       MachineRegisterInfo *MRI,
+                                       bool ReplaceInside, bool ReplaceOutside,
+                                       bool IncludeLoopPHI) {
+  assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+  DEBUG(dbgs() << "Preparing to replace register (region): "
+               << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+               << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+  // If we are replacing outside, we also need to update the LiveOuts
+  if (ReplaceOutside &&
+      (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+    LinearizedRegion *Current = this;
+    while (Current != nullptr && Current->getEntry() != nullptr) {
+      DEBUG(dbgs() << "Region before register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current->replaceLiveOut(Register, NewRegister);
+      DEBUG(dbgs() << "Region after register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current = Current->getParent();
+    }
+  }
+
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+                                         E = MRI->reg_end();
+       I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+
+    // We don't rewrite defs.
+ if (O.isDef()) + continue; + + bool IsInside = contains(O.getParent()->getParent()); + bool IsLoopPHI = IsInside && (O.getParent()->isPHI() && + O.getParent()->getParent() == getEntry()); + bool ShouldReplace = (IsInside && ReplaceInside) || + (!IsInside && ReplaceOutside) || + (IncludeLoopPHI && IsLoopPHI); + if (ShouldReplace) { + + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + } else { + DEBUG(dbgs() << "Replacing register (region): " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + } +} + +void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs); +} + +void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs); +} + +DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; } + +void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) { + Entry = NewEntry; +} + +MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; } + +void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; } + +MachineBasicBlock *LinearizedRegion::getExit() { return Exit; } + +void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); } + +void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) { + for (const auto &MBB : InnerRegion->MBBs) { + addMBB(MBB); + } +} + +bool LinearizedRegion::contains(MachineBasicBlock *MBB) { + return MBBs.count(MBB) == 1; +} + +bool LinearizedRegion::isLiveOut(unsigned Reg) { + return LiveOuts.count(Reg) == 1; +} + +bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { + return MRI->def_begin(Reg) == MRI->def_end(); +} + +// After the code has been structurized, what was flagged as kills +// before are no longer register kills. 
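+// Clear the kill flag on any use whose def is in a different block, since the
+// value may now be live across the linearized control flow.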
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + for (auto MBBI : MBBs) { + MachineBasicBlock *MBB = MBBI; + for (auto &II : *MBB) { + for (auto &RI : II.uses()) { + if (RI.isReg()) { + unsigned Reg = RI.getReg(); + if (TRI->isVirtualRegister(Reg)) { + if (hasNoDef(Reg, MRI)) + continue; + if (!MRI->hasOneDef(Reg)) { + DEBUG(this->getEntry()->getParent()->dump()); + DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n"); + } + + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + MachineOperand *Def = &(*(MRI->def_begin(Reg))); + MachineOperand *UseOperand = &(RI); + bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; + if (UseIsOutsideDefMBB && UseOperand->isKill()) { + DEBUG(dbgs() << "Removing kill flag on register: " + << PrintReg(Reg, TRI) << "\n"); + UseOperand->setIsKill(false); + } + } + } + } + } + } +} + +void LinearizedRegion::initLiveOut(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + storeLiveOuts(Region, MRI, TRI, PHIInfo); +} + +LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + setEntry(MBB); + setExit(MBB); + storeLiveOuts(MBB, MRI, TRI, PHIInfo); + MBBs.insert(MBB); + Parent = nullptr; +} + +LinearizedRegion::LinearizedRegion() { + setEntry(nullptr); + setExit(nullptr); + Parent = nullptr; +} + +LinearizedRegion::~LinearizedRegion() {} + +class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { +private: + const MachineRegionInfo *Regions; + const SIInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + unsigned BBSelectRegister; + PHILinearize PHIInfo; + DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap; + + void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &RegionIndices); + void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &RegionIndices); + void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHINonRegionIndices); + + void storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices = nullptr); + + unsigned storePHILinearizationInfo(MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices); + + void extractKilledPHIs(MachineBasicBlock *MBB); + + bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices, + unsigned *ReplaceReg); + + bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg); + + void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *LastMerge, + SmallVector<unsigned, 2> &PHIRegionIndices); + void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *IfMBB, + SmallVector<unsigned, 2> &PHIRegionIndices); + void replaceLiveOutRegs(MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices, + unsigned CombinedSourceReg, + LinearizedRegion *LRegion); + void 
rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge, + MachineInstr &PHI, LinearizedRegion *LRegion); + + void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion); + void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB, + MachineInstr &PHI); + void rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB); + + bool regionIsSimpleIf(RegionMRT *Region); + + void transformSimpleIfRegion(RegionMRT *Region); + + void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II); + + void insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL = DebugLoc()); + + MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region); + + void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, unsigned DestRegister, + unsigned IfSourceRegister, unsigned CodeSourceRegister, + bool IsUndefIfSource = false); + + MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, + MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds); + + void prunePHIInfo(MachineBasicBlock *MBB); + void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg); + + void createEntryPHIs(LinearizedRegion *CurrentRegion); + void resolvePHIInfos(MachineBasicBlock *FunctionEntry); + + void replaceRegisterWith(unsigned Register, unsigned NewRegister); + + MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBB, + LinearizedRegion *LRegion, + unsigned BBSelectRegIn, + unsigned BBSelectRegOut); + + MachineBasicBlock * + createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut); + void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond); + + void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg); + + MachineInstr *getDefInstr(unsigned Reg); + void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, unsigned DestReg, + unsigned SourceReg); + bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion, + unsigned Register); + void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + LinearizedRegion *LRegion); + + void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion); + void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion); + + MachineBasicBlock *splitExit(LinearizedRegion *LRegion); + + MachineBasicBlock *splitEntry(LinearizedRegion *LRegion); + + LinearizedRegion *initLinearizedRegion(RegionMRT *Region); + + bool structurizeComplexRegion(RegionMRT *Region); + + bool structurizeRegion(RegionMRT *Region); + + bool structurizeRegions(RegionMRT *Region, bool isTopRegion); + +public: + static char ID; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineRegionInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + void initFallthroughMap(MachineFunction &MF); + + 
void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); + + unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII); + + RegionMRT *RMRT; + void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; } + + RegionMRT *getRegionMRT() { return RMRT; } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} + +char AMDGPUMachineCFGStructurizer::ID = 0; + +bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) { + MachineBasicBlock *Entry = Region->getEntry(); + MachineBasicBlock *Succ = Region->getSucc(); + bool FoundBypass = false; + bool FoundIf = false; + + if (Entry->succ_size() != 2) { + return false; + } + + for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(), + E = Entry->succ_end(); + SI != E; ++SI) { + MachineBasicBlock *Current = *SI; + + if (Current == Succ) { + FoundBypass = true; + } else if ((Current->succ_size() == 1) && + *(Current->succ_begin()) == Succ) { + FoundIf = true; + } + } + + return FoundIf && FoundBypass; +} + +void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) { + MachineBasicBlock *Entry = Region->getEntry(); + MachineBasicBlock *Exit = Region->getExit(); + TII->convertNonUniformIfRegion(Entry, Exit); +} + +static void fixMBBTerminator(MachineBasicBlock *MBB) { + + if (MBB->succ_size() == 1) { + auto *Succ = *(MBB->succ_begin()); + for (auto &TI : MBB->terminators()) { + for (auto &UI : TI.uses()) { + if (UI.isMBB() && UI.getMBB() != Succ) { + UI.setMBB(Succ); + } + } + } + } +} + +static void fixRegionTerminator(RegionMRT *Region) { + MachineBasicBlock *InternalSucc = nullptr; + MachineBasicBlock *ExternalSucc = nullptr; + LinearizedRegion *LRegion = Region->getLinearizedRegion(); + auto Exit = LRegion->getExit(); + + SmallPtrSet<MachineBasicBlock *, 2> Successors; + for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(), + SE = Exit->succ_end(); + SI != SE; ++SI) { + MachineBasicBlock *Succ = *SI; + if (LRegion->contains(Succ)) { + // Do not allow re-assign + assert(InternalSucc == nullptr); + InternalSucc = Succ; + } else { + // Do not allow re-assign + assert(ExternalSucc == nullptr); + ExternalSucc = Succ; + } + } + + for (auto &TI : Exit->terminators()) { + for (auto &UI : TI.uses()) { + if (UI.isMBB()) { + auto Target = UI.getMBB(); + if (Target != InternalSucc && Target != ExternalSucc) { + UI.setMBB(ExternalSucc); + } + } + } + } +} + +// If a region region is just a sequence of regions (and the exit +// block in the case of the top level region), we can simply skip +// linearizing it, because it is already linear +bool regionIsSequence(RegionMRT *Region) { + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (!CI->isRegion()) { + if (CI->getMBBMRT()->getMBB()->succ_size() > 1) { + return false; + } + } + } + return true; +} + +void fixupRegionExits(RegionMRT *Region) { + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (!CI->isRegion()) { + fixMBBTerminator(CI->getMBBMRT()->getMBB()); + } else { + fixRegionTerminator(CI->getRegionMRT()); + } + } +} + +void AMDGPUMachineCFGStructurizer::getPHIRegionIndices( + RegionMRT *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (Region->contains(Pred)) { + PHIRegionIndices.push_back(i); + } + } +} + +void 
AMDGPUMachineCFGStructurizer::getPHIRegionIndices( + LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHIRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (Region->contains(Pred)) { + PHIRegionIndices.push_back(i); + } + } +} + +void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices( + LinearizedRegion *Region, MachineInstr &PHI, + SmallVector<unsigned, 2> &PHINonRegionIndices) { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + MachineBasicBlock *Pred = getPHIPred(PHI, i); + if (!Region->contains(Pred)) { + PHINonRegionIndices.push_back(i); + } + } +} + +void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector<unsigned, 2> *RegionIndices) { + if (RegionIndices) { + for (auto i : *RegionIndices) { + PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); + } + } else { + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); + } + } +} + +unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( + MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) { + unsigned DestReg = getPHIDestReg(PHI); + unsigned LinearizeDestReg = + MRI->createVirtualRegister(MRI->getRegClass(DestReg)); + PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); + storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); + return LinearizeDestReg; +} + +void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { + // We need to create a new chain for the killed phi, but there is no + // need to do the renaming outside or inside the block. 
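+  // Record each PHI's destination and sources in PHIInfo, then erase the PHI;
+  // the values are reintroduced later when the entry PHIs are created.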
+  SmallPtrSet<MachineInstr *, 2> PHIs;
+  for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+                                         E = MBB->instr_end();
+       I != E; ++I) {
+    MachineInstr &Instr = *I;
+    if (Instr.isPHI()) {
+      unsigned PHIDestReg = getPHIDestReg(Instr);
+      DEBUG(dbgs() << "Extracting killed phi:\n");
+      DEBUG(Instr.dump());
+      PHIs.insert(&Instr);
+      PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+      storePHILinearizationInfoDest(PHIDestReg, Instr);
+    }
+  }
+
+  for (auto PI : PHIs) {
+    PI->eraseFromParent();
+  }
+}
+
+static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
+                             unsigned Index) {
+  for (auto i : PHIRegionIndices) {
+    if (i == Index)
+      return true;
+  }
+  return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                             SmallVector<unsigned, 2> &PHIIndices,
+                                             unsigned *ReplaceReg) {
+  return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                             unsigned CombinedSourceReg,
+                                             MachineBasicBlock *SourceMBB,
+                                             SmallVector<unsigned, 2> &PHIIndices,
+                                             unsigned *ReplaceReg) {
+  DEBUG(dbgs() << "Shrink PHI: ");
+  DEBUG(PHI.dump());
+  DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+               << "<def> = PHI(");
+
+  bool Replaced = false;
+  unsigned NumInputs = getPHINumInputs(PHI);
+  int SingleExternalEntryIndex = -1;
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    if (!isPHIRegionIndex(PHIIndices, i)) {
+      if (SingleExternalEntryIndex == -1) {
+        // Single entry
+        SingleExternalEntryIndex = i;
+      } else {
+        // Multiple entries
+        SingleExternalEntryIndex = -2;
+      }
+    }
+  }
+
+  if (SingleExternalEntryIndex > -1) {
+    *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+    // We should not rewrite the code; we only pick up the single value
+    // that represents the shrunk PHI.
+ Replaced = true; + } else { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + if (SourceMBB) { + MIB.addReg(CombinedSourceReg); + MIB.addMBB(SourceMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << SourceMBB->getNumber()); + } + + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } + PHI.eraseFromParent(); + return Replaced; +} + +void AMDGPUMachineCFGStructurizer::replacePHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, + SmallVector<unsigned, 2> &PHIRegionIndices) { + DEBUG(dbgs() << "Replace PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI) + << "<def> = PHI("); + + bool HasExternalEdge = false; + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (!isPHIRegionIndex(PHIRegionIndices, i)) { + HasExternalEdge = true; + } + } + + if (HasExternalEdge) { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(LastMerge); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << LastMerge->getNumber()); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } else { + replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); + } + PHI.eraseFromParent(); +} + +void AMDGPUMachineCFGStructurizer::replaceEntryPHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, + SmallVector<unsigned, 2> &PHIRegionIndices) { + + DEBUG(dbgs() << "Replace entry PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with "); + + unsigned NumInputs = getPHINumInputs(PHI); + unsigned NumNonRegionInputs = NumInputs; + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + NumNonRegionInputs--; + } + } + + if (NumNonRegionInputs == 0) { + auto DestReg = getPHIDestReg(PHI); + replaceRegisterWith(DestReg, CombinedSourceReg); + DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n"); + PHI.eraseFromParent(); + } else { + DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI("); + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(IfMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << IfMBB->getNumber()); + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + 
DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + PHI.eraseFromParent(); + } +} + +void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( + MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices, + unsigned CombinedSourceReg, LinearizedRegion *LRegion) { + bool WasLiveOut = false; + for (auto PII : PHIRegionIndices) { + unsigned Reg = getPHISourceReg(PHI, PII); + if (LRegion->isLiveOut(Reg)) { + bool IsDead = true; + + // Check if register is live out of the basic block + MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent(); + for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) { + if ((*UI).getParent()->getParent() != DefMBB) { + IsDead = false; + } + } + + DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is " + << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + if (IsDead) { + LRegion->removeLiveOut(Reg); + } + WasLiveOut = true; + } + } + + if (WasLiveOut) + LRegion->addLiveOut(CombinedSourceReg); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region, + MachineBasicBlock *LastMerge, + MachineInstr &PHI, + LinearizedRegion *LRegion) { + SmallVector<unsigned, 2> PHIRegionIndices; + getPHIRegionIndices(Region, PHI, PHIRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHIRegionIndices); + + replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices); + replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region, + MachineBasicBlock *IfMBB, + MachineInstr &PHI) { + SmallVector<unsigned, 2> PHINonRegionIndices; + getPHINonRegionIndices(Region, PHI, PHINonRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHINonRegionIndices); + replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices); +} + +static void collectPHIs(MachineBasicBlock *MBB, + SmallVector<MachineInstr *, 2> &PHIs) { + for (auto &BBI : *MBB) { + if (BBI.isPHI()) { + PHIs.push_back(&BBI); + } + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region, + MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion) { + SmallVector<MachineInstr *, 2> PHIs; + auto Exit = Region->getSucc(); + if (Exit == nullptr) + return; + + collectPHIs(Exit, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB) { + SmallVector<MachineInstr *, 2> PHIs; + auto Entry = Region->getEntry(); + + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionEntryPHI(Region, IfMBB, *PHII); + } +} + +void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL) { + DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); + MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); + bool HasTerminator = Terminator != MBB->instr_end(); + if (HasTerminator) { + TII->ReplaceTailWithBranchTo(Terminator, Dest); + } + if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) { + TII->insertUnconditionalBranch(*MBB, Dest, DL); + } +} + +static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) { + MachineBasicBlock *result = nullptr; + for (auto &MFI : MF) { + if (MFI.succ_size() == 0) { + 
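+      // If we see a second block without successors there is no single exit node.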
if (result == nullptr) { + result = &MFI; + } else { + return nullptr; + } + } + } + + return result; +} + +static bool hasOneExitNode(MachineFunction &MF) { + return getSingleExitNode(MF) != nullptr; +} + +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) { + auto Exit = Region->getSucc(); + + // If the exit is the end of the function, we just use the existing + MachineFunction *MF = Region->getEntry()->getParent(); + if (Exit == nullptr && hasOneExitNode(*MF)) { + return &(*(--(Region->getEntry()->getParent()->end()))); + } + + MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock(); + if (Exit == nullptr) { + MachineFunction::iterator ExitIter = MF->end(); + MF->insert(ExitIter, LastMerge); + } else { + MachineFunction::iterator ExitIter = Exit->getIterator(); + MF->insert(ExitIter, LastMerge); + LastMerge->addSuccessor(Exit); + insertUnconditionalBranch(LastMerge, Exit); + DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n"); + } + return LastMerge; +} + +void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned DestRegister, + unsigned IfSourceRegister, + unsigned CodeSourceRegister, + bool IsUndefIfSource) { + // If this is the function exit block, we don't need a phi. + if (MergeBB->succ_begin() == MergeBB->succ_end()) { + return; + } + DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber() + << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI(" + << PrintReg(IfSourceRegister, TRI) << ", BB#" + << IfBB->getNumber() << PrintReg(CodeSourceRegister, TRI) + << ", BB#" << CodeBB->getNumber() << ")\n"); + const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); + MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, + TII->get(TargetOpcode::PHI), DestRegister); + if (IsUndefIfSource && false) { + MIB.addReg(IfSourceRegister, RegState::Undef); + } else { + MIB.addReg(IfSourceRegister); + } + MIB.addMBB(IfBB); + MIB.addReg(CodeSourceRegister); + MIB.addMBB(CodeBB); +} + +static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) { + for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(), + E = MBB->succ_end(); + PI != E; ++PI) { + if ((*PI) != MBB) { + (MBB)->removeSuccessor(*PI); + } + } +} + +static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, + MachineBasicBlock *EndMBB) { + + // We have to check against the StartMBB successor becasuse a + // structurized region with a loop will have the entry block split, + // and the backedge will go to the entry successor. + DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs; + unsigned SuccSize = StartMBB->succ_size(); + if (SuccSize > 0) { + MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin()); + for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(), + E = EndMBB->succ_end(); + PI != E; ++PI) { + // Either we have a back-edge to the entry block, or a back-edge to the + // successor of the entry block since the block may be split. 
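+      // Only record the edge here; the removal happens after the walk so the
+      // successor iterators stay valid.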
+ if ((*PI) != StartMBB && + !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { + Succs.insert( + std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI)); + } + } + } + + for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(), + E = StartMBB->pred_end(); + PI != E; ++PI) { + if ((*PI) != EndMBB) { + Succs.insert( + std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB)); + } + } + + for (auto SI : Succs) { + std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI; + DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#" + << Edge.second->getNumber() << "\n"); + Edge.first->removeSuccessor(Edge.second); + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( + MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds) { + MachineFunction *MF = MergeBB->getParent(); + MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock(); + + if (InheritPreds) { + for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(), + E = CodeBBStart->pred_end(); + PI != E; ++PI) { + if ((*PI) != CodeBBEnd) { + MachineBasicBlock *Pred = (*PI); + Pred->addSuccessor(IfBB); + } + } + } + + removeExternalCFGEdges(CodeBBStart, CodeBBEnd); + + auto CodeBBStartI = CodeBBStart->getIterator(); + auto CodeBBEndI = CodeBBEnd->getIterator(); + auto MergeIter = MergeBB->getIterator(); + MF->insert(MergeIter, IfBB); + MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI); + IfBB->addSuccessor(MergeBB); + IfBB->addSuccessor(CodeBBStart); + + DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); + // Ensure that the MergeBB is a successor of the CodeEndBB. + if (!CodeBBEnd->isSuccessor(MergeBB)) + CodeBBEnd->addSuccessor(MergeBB); + + DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#" + << CodeBBEnd->getNumber() << "\n"); + + // If we have a single predecessor we can find a reasonable debug location + MachineBasicBlock *SinglePred = + CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr; + const DebugLoc &DL = SinglePred + ? 
SinglePred->findDebugLoc(SinglePred->getFirstTerminator()) + : DebugLoc(); + + unsigned Reg = + TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg, + SelectBB->getNumber() /* CodeBBStart->getNumber() */); + if (&(*(IfBB->getParent()->begin())) == IfBB) { + TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg, + CodeBBStart->getNumber()); + } + MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL); + + return IfBB; +} + +void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( + SmallVector<MachineOperand, 1> Cond) { + if (Cond.size() != 1) + return; + if (!Cond[0].isReg()) + return; + + unsigned CondReg = Cond[0].getReg(); + for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) { + (*UI).setIsKill(false); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg) { + MachineBasicBlock *TrueBB = nullptr; + MachineBasicBlock *FalseBB = nullptr; + SmallVector<MachineOperand, 1> Cond; + MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB]; + TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond); + + const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator()); + + if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) { + // This is an exit block, hence no successors. We will assign the + // bb select register to the entry block. + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, + CodeBB->getParent()->begin()->getNumber()); + insertUnconditionalBranch(CodeBB, MergeBB, DL); + return; + } + + if (FalseBB == nullptr && TrueBB == nullptr) { + TrueBB = FallthroughBB; + } else if (TrueBB != nullptr) { + FalseBB = + (FallthroughBB && (FallthroughBB != TrueBB)) ? 
FallthroughBB : FalseBB; + } + + if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) { + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, TrueBB->getNumber()); + } else { + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); + unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); + unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + TrueBBReg, TrueBB->getNumber()); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + FalseBBReg, FalseBB->getNumber()); + ensureCondIsNotKilled(Cond); + TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, Cond, TrueBBReg, FalseBBReg); + } + + insertUnconditionalBranch(CodeBB, MergeBB, DL); +} + +MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + DEBUG(dbgs() << "DEFS BEGIN:\n"); + for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { + DEBUG(DI->getParent()->dump()); + } + DEBUG(dbgs() << "DEFS END\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + return (*(MRI->def_begin(Reg))).getParent(); +} + +void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + unsigned DestReg, + unsigned SourceReg) { + // In this function we know we are part of a chain already, so we need + // to add the registers to the existing chain, and rename the register + // inside the region. + bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); + MachineInstr *DefInstr = getDefInstr(SourceReg); + if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) { + // Handle the case where the def is a PHI-def inside a basic + // block, then we only need to do renaming. Special care needs to + // be taken if the PHI-def is part of an existing chain, or if a + // new one needs to be created. + InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI); + + // We collect all PHI Information, and if we are at the region entry, + // all PHIs will be removed, and then re-introduced if needed. + storePHILinearizationInfoDest(DestReg, *DefInstr); + // We have picked up all the information we need now and can remove + // the PHI + PHIInfo.removeSource(DestReg, SourceReg, CodeBB); + DefInstr->eraseFromParent(); + } else { + // If this is not a phi-def, or it is a phi-def but from a linearized region + if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) { + // If this is a single BB and the definition is in this block we + // need to replace any uses outside the region. 
+      InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+    }
+    const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+    unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+    bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+    DEBUG(dbgs() << "Insert Chained PHI\n");
+    insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+                   SourceReg, IsLastDef);
+
+    PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+    if (IsLastDef) {
+      const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+                                NextDestReg, 0);
+      PHIInfo.deleteDef(DestReg);
+    } else {
+      PHIInfo.replaceDef(DestReg, NextDestReg);
+    }
+  }
+}
+
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+                                               LinearizedRegion *InnerRegion,
+                                               unsigned Register) {
+  return getDefInstr(Register)->getParent() == MBB ||
+         InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+                                                      MachineBasicBlock *CodeBB,
+                                                      MachineBasicBlock *MergeBB,
+                                                      LinearizedRegion *InnerRegion,
+                                                      LinearizedRegion *LRegion) {
+  DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+  SmallVector<unsigned, 4> OldLiveOuts;
+  bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+  for (auto OLI : *LiveOuts) {
+    OldLiveOuts.push_back(OLI);
+  }
+
+  for (auto LI : OldLiveOuts) {
+    DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+    if (!containsDef(CodeBB, InnerRegion, LI) ||
+        (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+      // If the register simply lives through the CodeBB, we don't have
+      // to rewrite anything since the register is not defined in this
+      // part of the code.
+      DEBUG(dbgs() << "- through");
+      continue;
+    }
+    DEBUG(dbgs() << "\n");
+    unsigned Reg = LI;
+    if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+      // If the register is live out, we do want to create a phi,
+      // unless it is from the Exit block, because in that case there
+      // is already a PHI, and no need to create a new one.
+
+      // If the register is just a live out def and not part of a phi
+      // chain, we need to create a PHI node to handle the if region,
+      // and replace all uses outside of the region with the new dest
+      // register, unless it is the outgoing BB select register. We have
+      // already created phi nodes for these.
+      const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+      unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+      unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+      // Create an initializer; this value is never used, but is needed
+      // to satisfy SSA.
+      DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+                                IfSourceReg, 0);
+
+      InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+      DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+      insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+                     IfSourceReg, Reg, true);
+    }
+  }
+
+  // Handle the chained definitions in PHIInfo, checking if this basic block
+  // is a source block for a definition.
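+  // For each such source, insertChainedPHI either renames the value into the
+  // region or inserts a merge PHI in MergeBB, and updates PHIInfo to match.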
+ SmallVector<unsigned, 4> Sources; + if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { + DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber() + << "\n"); + for (auto SI : Sources) { + unsigned DestReg; + PHIInfo.findDest(SI, CodeBB, DestReg); + insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI); + } + DEBUG(dbgs() << "Insertion done.\n"); + } + + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { + DEBUG(dbgs() << "Before PHI Prune\n"); + DEBUG(PHIInfo.dump(MRI)); + SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4> + ElimiatedSources; + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + + unsigned DestReg = *DRI; + auto SE = PHIInfo.sources_end(DestReg); + + bool MBBContainsPHISource = false; + // Check if there is a PHI source in this MBB + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + unsigned SourceReg = (*SRI).first; + MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); + if (Def->getParent()->getParent() == MBB) { + MBBContainsPHISource = true; + } + } + + // If so, all other sources are useless since we know this block + // is always executed when the region is executed. + if (MBBContainsPHISource) { + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + PHILinearize::PHISourceT Source = *SRI; + unsigned SourceReg = Source.first; + MachineBasicBlock *SourceMBB = Source.second; + MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); + if (Def->getParent()->getParent() != MBB) { + ElimiatedSources.push_back( + std::make_tuple(DestReg, SourceReg, SourceMBB)); + } + } + } + } + + // Remove the PHI sources that are in the given MBB + for (auto &SourceInfo : ElimiatedSources) { + PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo), + std::get<2>(SourceInfo)); + } + DEBUG(dbgs() << "After PHI Prune\n"); + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion, + unsigned DestReg) { + MachineBasicBlock *Entry = CurrentRegion->getEntry(); + MachineBasicBlock *Exit = CurrentRegion->getExit(); + + DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() + << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n"); + + int NumSources = 0; + auto SE = PHIInfo.sources_end(DestReg); + + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + NumSources++; + } + + if (NumSources == 1) { + auto SRI = PHIInfo.sources_begin(DestReg); + unsigned SourceReg = (*SRI).first; + replaceRegisterWith(DestReg, SourceReg); + } else { + const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); + MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, + TII->get(TargetOpcode::PHI), DestReg); + DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI("); + + unsigned CurrentBackedgeReg = 0; + + for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { + unsigned SourceReg = (*SRI).first; + + if (CurrentRegion->contains((*SRI).second)) { + if (CurrentBackedgeReg == 0) { + CurrentBackedgeReg = SourceReg; + } else { + MachineInstr *PHIDefInstr = getDefInstr(SourceReg); + MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent(); + const TargetRegisterClass *RegClass = + MRI->getRegClass(CurrentBackedgeReg); + unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass); + MachineInstrBuilder BackedgePHI = + BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, + TII->get(TargetOpcode::PHI), NewBackedgeReg); + 
BackedgePHI.addReg(CurrentBackedgeReg); + BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0)); + BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1)); + BackedgePHI.addMBB((*SRI).second); + CurrentBackedgeReg = NewBackedgeReg; + DEBUG(dbgs() << "Inserting backedge PHI: " + << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI(" + << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" + << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", " + << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI) + << ", BB#" << (*SRI).second->getNumber()); + } + } else { + MIB.addReg(SourceReg); + MIB.addMBB((*SRI).second); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << (*SRI).second->getNumber() << ", "); + } + } + + // Add the final backedge register source to the entry phi + if (CurrentBackedgeReg != 0) { + MIB.addReg(CurrentBackedgeReg); + MIB.addMBB(Exit); + DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" + << Exit->getNumber() << ")\n"); + } else { + DEBUG(dbgs() << ")\n"); + } + } +} + +void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) { + DEBUG(PHIInfo.dump(MRI)); + + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + + unsigned DestReg = *DRI; + createEntryPHI(CurrentRegion, DestReg); + } + PHIInfo.clear(); +} + +void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, + unsigned NewRegister) { + assert(Register != NewRegister && "Cannot replace a reg with itself"); + + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register), + E = MRI->reg_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + // We don't handle physical registers, but if we need to + // in the future This is how we do it: + // O.substPhysReg(NewRegister, *TRI); + } else { + DEBUG(dbgs() << "Replacing register: " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + PHIInfo.deleteDef(Register); + + getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + + DEBUG(PHIInfo.dump(MRI)); +} + +void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) { + DEBUG(dbgs() << "Resolve PHI Infos\n"); + DEBUG(PHIInfo.dump(MRI)); + for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; + ++DRI) { + unsigned DestReg = *DRI; + DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n"); + auto SRI = PHIInfo.sources_begin(DestReg); + unsigned SourceReg = (*SRI).first; + DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) + << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n"); + + assert(PHIInfo.sources_end(DestReg) == ++SRI && + "More than one phi source in entry node"); + replaceRegisterWith(DestReg, SourceReg); + } +} + +static bool isFunctionEntryBlock(MachineBasicBlock *MBB) { + return ((&(*(MBB->getParent()->begin()))) == MBB); +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB, + LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn, + unsigned BBSelectRegOut) { + if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) { + // Handle non-loop function entry block. 
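The pruning rule above (keep only the source defined in a block that is always executed when the region runs) and the single-source collapse in createEntryPHI are easiest to see on a toy model of the PHIInfo table. This is only an illustrative sketch with stand-in types, not the real PHILinearize class:

#include <algorithm>
#include <map>
#include <utility>
#include <vector>

using Reg = unsigned;  // stand-in for a virtual register
using BlockId = int;   // stand-in for a MachineBasicBlock number

// DestReg -> (SourceReg, block defining it), mirroring what PHIInfo tracks.
using PHITable = std::map<Reg, std::vector<std::pair<Reg, BlockId>>>;

// If one of a PHI's sources is defined in MBB, and MBB always executes when
// the region executes, the other sources can never carry the live value and
// are dropped -- the same rule prunePHIInfo applies per destination register.
void pruneSourcesDefinedOutside(PHITable &Table, BlockId MBB) {
  for (auto &Entry : Table) {
    auto &Sources = Entry.second;
    bool HasLocalDef = std::any_of(
        Sources.begin(), Sources.end(),
        [MBB](const std::pair<Reg, BlockId> &S) { return S.second == MBB; });
    if (!HasLocalDef)
      continue;
    Sources.erase(std::remove_if(Sources.begin(), Sources.end(),
                                 [MBB](const std::pair<Reg, BlockId> &S) {
                                   return S.second != MBB;
                                 }),
                  Sources.end());
    // A destination left with a single source is what createEntryPHI later
    // collapses via replaceRegisterWith instead of emitting a PHI.
  }
}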
+ // We need to allow loops to the entry block and then + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + resolvePHIInfos(CodeBB); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } + if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) { + // Handle non-loop region entry block. + MachineFunction *MF = MergeBB->getParent(); + auto MergeIter = MergeBB->getIterator(); + auto CodeBBStartIter = CodeBB->getIterator(); + auto CodeBBEndIter = ++(CodeBB->getIterator()); + if (CodeBBEndIter != MergeIter) { + MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter); + } + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + prunePHIInfo(CodeBB); + createEntryPHIs(CurrentRegion); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } else { + // Handle internal block. + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); + unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); + bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, + BBSelectRegIn, IsRegionEntryBB); + CurrentRegion->addMBB(IfBB); + // If this is the entry block we need to make the If block the new + // linearized region entry. + if (IsRegionEntryBB) { + CurrentRegion->setEntry(IfBB); + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector<MachineOperand, 1> ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = TII->insertNE( + RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = + MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(CurrentRegion->getEntry()); + } + } + CurrentRegion->addMBB(CodeBB); + LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + + InnerRegion.setParent(CurrentRegion); + DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + InnerRegion.addMBB(MergeBB); + + DEBUG(InnerRegion.print(dbgs(), TRI)); + rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); + extractKilledPHIs(CodeBB); + if (IsRegionEntryBB) { + createEntryPHIs(CurrentRegion); + } + return IfBB; + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut) { + unsigned CodeBBSelectReg = + InnerRegion->getRegionMRT()->getInnerOutputRegister(); + MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry(); + MachineBasicBlock *CodeExitBB = InnerRegion->getExit(); + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB, + 
SelectBB, BBSelectRegIn, true); + CurrentRegion->addMBB(IfBB); + bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry(); + if (isEntry) { + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector<MachineOperand, 1> ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = + TII->insertNE(RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef<MachineOperand> Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(IfBB); + } + } + CurrentRegion->addMBBs(InnerRegion); + DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); + insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + + rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion, + CurrentRegion); + + rewriteRegionEntryPHIs(InnerRegion, IfBB); + + if (isEntry) { + CurrentRegion->setEntry(IfBB); + } + + if (isEntry) { + createEntryPHIs(CurrentRegion); + } + + return IfBB; +} + +void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, + MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion) { + SmallVector<unsigned, 2> PHIRegionIndices; + getPHIRegionIndices(LRegion, PHI, PHIRegionIndices); + + assert(PHIRegionIndices.size() == 1); + + unsigned RegionIndex = PHIRegionIndices[0]; + unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex); + MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex); + unsigned PHIDest = getPHIDestReg(PHI); + unsigned PHISource = PHIDest; + unsigned ReplaceReg; + + if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) { + PHISource = ReplaceReg; + } + + const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); + unsigned NewDestReg = MRI->createVirtualRegister(RegClass); + LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); + MachineInstrBuilder MIB = + BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), + TII->get(TargetOpcode::PHI), NewDestReg); + DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI) + << "<def> = PHI("); + MIB.addReg(PHISource); + MIB.addMBB(Entry); + DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber()); + MIB.addReg(RegionSourceReg); + MIB.addMBB(RegionSourceMBB); + DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#" + << RegionSourceMBB->getNumber() << ")\n"); +} + +void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion) { + SmallVector<MachineInstr *, 2> PHIs; + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + splitLoopPHI(*PHII, Entry, EntrySucc, LRegion); + } +} + +// Split the exit block so that we can insert a end control flow +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { + auto MRTRegion = LRegion->getRegionMRT(); + auto Exit = LRegion->getExit(); + auto MF = Exit->getParent(); + auto Succ = 
MRTRegion->getSucc(); + + auto NewExit = MF->CreateMachineBasicBlock(); + auto AfterExitIter = Exit->getIterator(); + AfterExitIter++; + MF->insert(AfterExitIter, NewExit); + Exit->removeSuccessor(Succ); + Exit->addSuccessor(NewExit); + NewExit->addSuccessor(Succ); + insertUnconditionalBranch(NewExit, Succ); + LRegion->addMBB(NewExit); + LRegion->setExit(NewExit); + + DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n"); + + // Replace any PHI Predecessors in the successor with NewExit + for (auto &II : *Succ) { + MachineInstr &Instr = II; + + // If we are past the PHI instructions we are done + if (!Instr.isPHI()) + break; + + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + auto Pred = getPHIPred(Instr, i); + if (Pred == Exit) { + setPhiPred(Instr, i, NewExit); + } + } + } + + return NewExit; +} + + +static MachineBasicBlock *split(MachineBasicBlock::iterator I) { + // Create the fall-through block. + MachineBasicBlock *MBB = (*I).getParent(); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); + auto MBBIter = ++(MBB->getIterator()); + MF->insert(MBBIter, SuccMBB); + SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(SuccMBB); + + // Splice the code over. + SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end()); + + return SuccMBB; +} + +// Split the entry block separating PHI-nodes and the rest of the code +// This is needed to insert an initializer for the bb select register +// inloop regions. + +MachineBasicBlock * +AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { + MachineBasicBlock *Entry = LRegion->getEntry(); + MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); + MachineBasicBlock *Exit = LRegion->getExit(); + + DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#" + << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber() + << "\n"); + LRegion->addMBB(EntrySucc); + + // Make the backedge go to Entry Succ + if (Exit->isSuccessor(Entry)) { + Exit->removeSuccessor(Entry); + } + Exit->addSuccessor(EntrySucc); + MachineInstr &Branch = *(Exit->instr_rbegin()); + for (auto &UI : Branch.uses()) { + if (UI.isMBB() && UI.getMBB() == Entry) { + UI.setMBB(EntrySucc); + } + } + + splitLoopPHIs(Entry, EntrySucc, LRegion); + + return EntrySucc; +} + +LinearizedRegion * +AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) { + LinearizedRegion *LRegion = Region->getLinearizedRegion(); + LRegion->initLiveOut(Region, MRI, TRI, PHIInfo); + LRegion->setEntry(Region->getEntry()); + return LRegion; +} + +static void removeOldExitPreds(RegionMRT *Region) { + MachineBasicBlock *Exit = Region->getSucc(); + if (Exit == nullptr) { + return; + } + for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(), + E = Exit->pred_end(); + PI != E; ++PI) { + if (Region->contains(*PI)) { + (*PI)->removeSuccessor(Exit); + } + } +} + +static bool mbbHasBackEdge(MachineBasicBlock *MBB, + SmallPtrSet<MachineBasicBlock *, 8> &MBBs) { + for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { + if (MBBs.count(*SI) != 0) { + return true; + } + } + return false; +} + +static bool containsNewBackedge(MRT *Tree, + SmallPtrSet<MachineBasicBlock *, 8> &MBBs) { + // Need to traverse this in reverse since it is in post order. 
+ if (Tree == nullptr) + return false; + + if (Tree->isMBB()) { + MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB(); + MBBs.insert(MBB); + if (mbbHasBackEdge(MBB, MBBs)) { + return true; + } + } else { + RegionMRT *Region = Tree->getRegionMRT(); + SetVector<MRT *> *Children = Region->getChildren(); + for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) { + if (containsNewBackedge(*CI, MBBs)) + return true; + } + } + return false; +} + +static bool containsNewBackedge(RegionMRT *Region) { + SmallPtrSet<MachineBasicBlock *, 8> MBBs; + return containsNewBackedge(Region, MBBs); +} + +bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { + auto *LRegion = initLinearizedRegion(Region); + LRegion->setHasLoop(containsNewBackedge(Region)); + MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region); + MachineBasicBlock *CurrentMerge = LastMerge; + LRegion->addMBB(LastMerge); + LRegion->setExit(LastMerge); + + rewriteRegionExitPHIs(Region, LastMerge, LRegion); + removeOldExitPreds(Region); + + DEBUG(PHIInfo.dump(MRI)); + + SetVector<MRT *> *Children = Region->getChildren(); + DEBUG(dbgs() << "===========If Region Start===============\n"); + if (LRegion->getHasLoop()) { + DEBUG(dbgs() << "Has Backedge: Yes\n"); + } else { + DEBUG(dbgs() << "Has Backedge: No\n"); + } + + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) { + DEBUG(dbgs() << "CurrentRegion: \n"); + DEBUG(LRegion->print(dbgs(), TRI)); + + auto CNI = CI; + ++CNI; + + MRT *Child = (*CI); + + if (Child->isRegion()) { + + LinearizedRegion *InnerLRegion = + Child->getRegionMRT()->getLinearizedRegion(); + // We found the block is the exit of an inner region, we need + // to put it in the current linearized region. + + DEBUG(dbgs() << "Linearizing region: "); + DEBUG(InnerLRegion->print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + + MachineBasicBlock *InnerEntry = InnerLRegion->getEntry(); + if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) { + // Entry has already been linearized, no need to do this region. + unsigned OuterSelect = InnerLRegion->getBBSelectRegOut(); + unsigned InnerSelectReg = + InnerLRegion->getRegionMRT()->getInnerOutputRegister(); + replaceRegisterWith(InnerSelectReg, OuterSelect), + resolvePHIInfos(InnerEntry); + if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge)) + InnerLRegion->getExit()->addSuccessor(CurrentMerge); + continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion, + Child->getRegionMRT()->getEntry(), + BBSelectRegIn, BBSelectRegOut); + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } else { + MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB(); + DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); + + if (MBB == getSingleExitNode(*(MBB->getParent()))) { + // If this is the exit block then we need to skip to the next. + // The "in" register will be transferred to "out" in the next + // iteration. 
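As a mental model for the linearization that structurizeComplexRegion drives, the BBSelectRegIn/BBSelectRegOut pair behaves like an ordinary integer that names the next block to run; each linearized block is guarded by a compare against it and then overwrites it. A plain C++ analogy (block numbers invented, no MIR or exec-mask details):

// Original region, conceptually: Entry branches to A or B, both reach Merge.
// After linearization, A and B are laid out in sequence behind guards on the
// select value, which stands in for the BB select register.
int runLinearizedRegion(bool TakeA) {
  int Sel = TakeA ? 2 : 3;  // BBSelectRegIn: which block the region should run
  int Result = 0;
  if (Sel == 2) {           // guard created for block A (createIfBlock)
    Result = 10;            // A's code
    Sel = 4;                // BBSelectRegOut: select the merge block next
  }
  if (Sel == 3) {           // guard created for block B
    Result = 20;            // B's code
    Sel = 4;
  }
  // Sel == 4: the merge block; the PHIs over Result correspond to the merge
  // PHIs inserted by insertMergePHI.
  return Result;
}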
+ continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + // This is a basic block that is not part of an inner region, we + // need to put it in the current linearized region. + CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn, + BBSelectRegOut); + if (CurrentMerge) { + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } + + DEBUG(PHIInfo.dump(MRI)); + } + } + + LRegion->removeFalseRegisterKills(MRI); + + if (LRegion->getHasLoop()) { + MachineBasicBlock *NewSucc = splitEntry(LRegion); + if (isFunctionEntryBlock(LRegion->getEntry())) { + resolvePHIInfos(LRegion->getEntry()); + } + const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); + unsigned InReg = LRegion->getBBSelectRegIn(); + unsigned InnerSelectReg = + MRI->createVirtualRegister(MRI->getRegClass(InReg)); + unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + TII->materializeImmediate(*(LRegion->getEntry()), + LRegion->getEntry()->getFirstTerminator(), DL, + NewInReg, Region->getEntry()->getNumber()); + // Need to be careful about updating the registers inside the region. + LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); + DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, + InnerSelectReg, NewInReg, + LRegion->getRegionMRT()->getInnerOutputRegister()); + splitExit(LRegion); + TII->convertNonUniformLoopRegion(NewSucc, LastMerge); + } + + if (Region->isRoot()) { + TII->insertReturn(*LastMerge); + } + + DEBUG(Region->getEntry()->getParent()->dump()); + DEBUG(LRegion->print(dbgs(), TRI)); + DEBUG(PHIInfo.dump(MRI)); + + DEBUG(dbgs() << "===========If Region End===============\n"); + + Region->setLinearizedRegion(LRegion); + return true; +} + +bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { + if (false && regionIsSimpleIf(Region)) { + transformSimpleIfRegion(Region); + return true; + } else if (regionIsSequence(Region)) { + fixupRegionExits(Region); + return false; + } else { + structurizeComplexRegion(Region); + } + return false; +} + +static int structurize_once = 0; + +bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, + bool isTopRegion) { + bool Changed = false; + + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (CI->isRegion()) { + Changed |= structurizeRegions(CI->getRegionMRT(), false); + } + } + + if (structurize_once < 2 || true) { + Changed |= structurizeRegion(Region); + structurize_once++; + } + return Changed; +} + +void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { + DEBUG(dbgs() << "Fallthrough Map:\n"); + for (auto &MBBI : MF) { + MachineBasicBlock *MBB = MBBI.getFallThrough(); + if (MBB != nullptr) { + DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); + } + FallthroughMap[&MBBI] = MBB; + } +} + +void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, + unsigned SelectOut) { + LinearizedRegion *LRegion = new LinearizedRegion(); + if (SelectOut) { + LRegion->addLiveOut(SelectOut); + DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI) + << "\n"); + } + LRegion->setRegionMRT(Region); + Region->setLinearizedRegion(LRegion); + 
LRegion->setParent(Region->getParent() + ? Region->getParent()->getLinearizedRegion() + : nullptr); +} + +unsigned +AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII) { + if (MRT->isRegion()) { + RegionMRT *Region = MRT->getRegionMRT(); + Region->setBBSelectRegOut(SelectOut); + unsigned InnerSelectOut = createBBSelectReg(TII, MRI); + + // Fixme: Move linearization creation to the original spot + createLinearizedRegion(Region, SelectOut); + + for (auto CI = Region->getChildren()->begin(), + CE = Region->getChildren()->end(); + CI != CE; ++CI) { + InnerSelectOut = + initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); + } + MRT->setBBSelectRegIn(InnerSelectOut); + return InnerSelectOut; + } else { + MRT->setBBSelectRegOut(SelectOut); + unsigned NewSelectIn = createBBSelectReg(TII, MRI); + MRT->setBBSelectRegIn(NewSelectIn); + return NewSelectIn; + } +} + +static void checkRegOnlyPHIInputs(MachineFunction &MF) { + for (auto &MBBI : MF) { + for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(), + E = MBBI.instr_end(); + I != E; ++I) { + MachineInstr &Instr = *I; + if (Instr.isPHI()) { + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + assert(Instr.getOperand(i * 2 + 1).isReg() && + "PHI Operand not a register"); + } + } + } + } +} + + +INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) +INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) + +char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; + + +bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &(MF.getRegInfo()); + initFallthroughMap(MF); + + checkRegOnlyPHIInputs(MF); + DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + DEBUG(MF.dump()); + + Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo()); + DEBUG(Regions->dump()); + + RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); + setRegionMRT(RTree); + initializeSelectRegisters(RTree, 0, MRI, TII); + DEBUG(RTree->dump(TRI)); + bool result = structurizeRegions(RTree, true); + delete RTree; + DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + initFallthroughMap(MF); + return result; +} + +FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { + return new AMDGPUMachineCFGStructurizer(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 40c3327..9fb7f5f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,8 +19,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL || - MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) { + IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())), + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. 
} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 5d0640b..99bb61b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -10,8 +10,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -30,7 +30,11 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { /// Start of implicit kernel args unsigned ABIArgOffset; - bool IsKernel; + // Kernels + shaders. i.e. functions called by the driver and not not called + // by other functions. + bool IsEntryFunction; + + bool NoSignedZerosFPMath; public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -66,8 +70,12 @@ public: return LDSSize; } - bool isKernel() const { - return IsKernel; + bool isEntryFunction() const { + return IsEntryFunction; + } + + bool hasNoSignedZerosFPMath() const { + return NoSignedZerosFPMath; } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp new file mode 100644 index 0000000..7263ba7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -0,0 +1,64 @@ +//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMacroFusion.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" + +#include "llvm/CodeGen/MacroFusion.h" + +using namespace llvm; + +namespace { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &TII = static_cast<const SIInstrInfo&>(TII_); + + switch (SecondMI.getOpcode()) { + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_CNDMASK_B32_e64: { + // Try to cluster defs of condition registers to their uses. This improves + // the chance VCC will be available which will allow shrinking to VOP2 + // encodings. 
+ if (!FirstMI) + return true; + + const MachineOperand *Src2 = TII.getNamedOperand(SecondMI, + AMDGPU::OpName::src2); + return FirstMI->definesRegister(Src2->getReg()); + } + default: + return false; + } + + return false; +} + +} // end namespace + + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h new file mode 100644 index 0000000..8449585 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h @@ -0,0 +1,19 @@ +//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); +/// to AMDGPUPassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation(); + +} // llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h index 947d45b..71b9ab6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -19,12 +19,13 @@ namespace AMDGPU { -namespace PT_NOTE { +namespace ElfNote { const char SectionName[] = ".note"; const char NoteName[] = "AMD"; +// TODO: Move this enum to include/llvm/Support so it can be used in tools? 
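As the header comment above says, the mutation does nothing until a pass config hands it to the machine scheduler; clustering the V_ADDC/V_CNDMASK-style user next to the instruction defining its condition register is what keeps VCC available for the later VOP2 shrink. A minimal wiring sketch, using the generic scheduler factory rather than the target's actual createMachineScheduler() override:

#include "AMDGPUMacroFusion.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

// Illustrative only: build a generic pre-RA scheduling DAG and attach the
// AMDGPU macro-fusion mutation, the way a PassConfig override would.
static ScheduleDAGInstrs *createSchedulerWithFusion(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}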
enum NoteType{ NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, NT_AMDGPU_HSA_HSAIL = 2, @@ -32,7 +33,7 @@ enum NoteType{ NT_AMDGPU_HSA_PRODUCER = 4, NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_RUNTIME_METADATA = 7, + NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10, NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index baa28de..625c9b7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -14,12 +14,50 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <map> +#include <tuple> +#include <utility> +#include <vector> #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -31,16 +69,16 @@ namespace { class AMDGPUPromoteAlloca : public FunctionPass { private: const TargetMachine *TM; - Module *Mod; - const DataLayout *DL; - MDNode *MaxWorkGroupSizeRange; + Module *Mod = nullptr; + const DataLayout *DL = nullptr; + AMDGPUAS AS; // FIXME: This should be per-kernel. - uint32_t LocalMemLimit; - uint32_t CurrentLocalMemUsage; + uint32_t LocalMemLimit = 0; + uint32_t CurrentLocalMemUsage = 0; - bool IsAMDGCN; - bool IsAMDHSA; + bool IsAMDGCN = false; + bool IsAMDHSA = false; std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); @@ -59,26 +97,20 @@ private: Instruction *UseInst, int OpIdx0, int OpIdx1) const; + /// Check whether we have enough local memory for promotion. 
+ bool hasSufficientLocalMem(const Function &F); + public: static char ID; - AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) : - FunctionPass(ID), - TM(TM_), - Mod(nullptr), - DL(nullptr), - MaxWorkGroupSizeRange(nullptr), - LocalMemLimit(0), - CurrentLocalMemUsage(0), - IsAMDGCN(false), - IsAMDHSA(false) { } + AMDGPUPromoteAlloca() : FunctionPass(ID) {} bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - void handleAlloca(AllocaInst &I); + bool handleAlloca(AllocaInst &I, bool SufficientLDS); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -86,146 +118,60 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char AMDGPUPromoteAlloca::ID = 0; -INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, - "AMDGPU promote alloca to vector or LDS", false, false) +INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, + "AMDGPU promote alloca to vector or LDS", false, false) char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; - bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - if (!TM) - return false; - Mod = &M; DL = &Mod->getDataLayout(); - // The maximum workitem id. - // - // FIXME: Should get as subtarget property. Usually runtime enforced max is - // 256. - MDBuilder MDB(Mod->getContext()); - MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048)); - - const Triple &TT = TM->getTargetTriple(); - - IsAMDGCN = TT.getArch() == Triple::amdgcn; - IsAMDHSA = TT.getOS() == Triple::AMDHSA; - return false; } bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - if (!TM || skipFunction(F)) + if (skipFunction(F)) return false; - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); - if (!ST.isPromoteAllocaEnabled()) - return false; - - FunctionType *FTy = F.getFunctionType(); - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (Type *ParamTy : FTy->params()) { - PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - return false; - } - } - - LocalMemLimit = ST.getLocalMemorySize(); - if (LocalMemLimit == 0) + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) + TM = &TPC->getTM<TargetMachine>(); + else return false; - const DataLayout &DL = Mod->getDataLayout(); - - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; - for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) - continue; - - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast<Instruction>(U); - if (!Use) - continue; - - if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); - - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. 
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); - CurrentLocalMemUsage += AllocSize; - break; - } - } - } - - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage); - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - - - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy); + const Triple &TT = TM->getTargetTriple(); + IsAMDGCN = TT.getArch() == Triple::amdgcn; + IsAMDHSA = TT.getOS() == Triple::AMDHSA; - // Program is possibly broken by using more local mem than available. - if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + if (!ST.isPromoteAllocaEnabled()) return false; - LocalMemLimit = MaxSizeWithWaveCount; - - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); + AS = AMDGPU::getAMDGPUAS(*F.getParent()); + bool SufficientLDS = hasSufficientLocalMem(F); + bool Changed = false; BasicBlock &EntryBB = *F.begin(); for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { AllocaInst *AI = dyn_cast<AllocaInst>(I); ++I; if (AI) - handleAlloca(*AI); + Changed |= handleAlloca(*AI, SufficientLDS); } - return true; + return Changed; } std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); + if (!IsAMDHSA) { Function *LocalSizeYFn = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); @@ -235,8 +181,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); - LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); - LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LocalSizeY); + ST.makeLIDRangeMetadata(LocalSizeZ); return std::make_pair(LocalSizeY, LocalSizeZ); } @@ -279,15 +225,15 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); - DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); - DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); // Size of the dispatch packet struct. 
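The constant offsets above line up with the public hsa_kernel_dispatch_packet_t layout: the i32 load at byte offset 4 picks up workgroup_size_x and _y together, and the load at offset 8 picks up workgroup_size_z with reserved0 (expected to be zero) in its upper half. A trimmed sketch of that prefix, kept only as far as the fields the pass touches:

#include <cstdint>

struct KernelDispatchPacketPrefix { // leading fields of hsa_kernel_dispatch_packet_t
  uint16_t header;            // byte 0
  uint16_t setup;             // byte 2
  uint16_t workgroup_size_x;  // byte 4  -> low half of the LoadXY i32
  uint16_t workgroup_size_y;  // byte 6  -> recovered as LoadXY >> 16
  uint16_t workgroup_size_z;  // byte 8  -> low half of LoadZU
  uint16_t reserved0;         // byte 10 -> upper half of LoadZU, should be zero
};

static_assert(sizeof(KernelDispatchPacketPrefix) == 12,
              "no padding between the uint16_t fields");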
- DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64); + DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64); Type *I32Ty = Type::getInt32Ty(Mod->getContext()); Value *CastDispatchPtr = Builder.CreateBitCast( - DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); + DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS)); // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier @@ -298,10 +244,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); - MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); + MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); - LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LoadZU); // Extract y component. Upper half of LoadZU should be zero already. Value *Y = Builder.CreateLShr(LoadXY, 16); @@ -310,6 +256,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -332,7 +280,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); - CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(CI); return CI; } @@ -369,29 +317,37 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // instructions. static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { - case Instruction::Load: + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(Inst); + // Currently only handle the case where the Pointer Operand is a GEP so check for that case. + return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile(); + } case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; case Instruction::Store: { - // Must be the stored pointer operand, not a stored value. + // Must be the stored pointer operand, not a stored value, plus + // since it should be canonical form, the User should be a GEP. StoreInst *SI = cast<StoreInst>(Inst); - return SI->getPointerOperand() == User; + return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile(); } default: return false; } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. + // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. 
Potentially these + // could also be promoted but we don't currently handle this case if (!AllocaTy || AllocaTy->getElementType()->isVectorTy() || + AllocaTy->getElementType()->isArrayTy() || AllocaTy->getNumElements() > 4 || AllocaTy->getNumElements() < 2) { DEBUG(dbgs() << " Cannot convert type to vector\n"); @@ -438,8 +394,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); - Value *Ptr = Inst->getOperand(0); + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); + Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); @@ -450,14 +406,15 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); - Value *Ptr = Inst->getOperand(1); + StoreInst *SI = cast<StoreInst>(Inst); + Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, - Inst->getOperand(0), + SI->getValueOperand(), Index); Builder.CreateStore(NewVecValue, BitCast); Inst->eraseFromParent(); @@ -580,6 +537,9 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( } if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { + // Give up if the pointer may be captured. + if (PointerMayBeCaptured(UseInst, true, true)) + return false; // Don't collect the users of this. WorkList.push_back(User); continue; @@ -626,12 +586,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } +bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { + + FunctionType *FTy = F.getFunctionType(); + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + return false; + } + } + + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast<Instruction>(U); + if (!Use) + continue; + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. 
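Looking back at the vector path, tryPromoteAllocaToVector only accepts a flat array of two to four non-aggregate elements whose loads and stores go through GEPs. A source-level illustration (plain C++, meant only to show the shape of code that produces such an alloca):

// Compiled for AMDGPU this becomes an alloca of [3 x float] accessed through
// GEPs; after promotion the array lives in a <3 x float>-style vector, the
// initializing stores become insertelements and the indexed load becomes an
// extractelement, so no scratch memory is needed.
float selectLane(float a, float b, float c, int lane) {
  float tmp[3] = {a, b, c};
  return tmp[lane];
}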
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; + } + } + } + + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); + + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. + + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint = ST.getWavesPerEU(F).second; + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + + // Program is possibly broken by using more local mem than available. + if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + return true; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. -void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { +bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) - return; + return false; IRBuilder<> Builder(&I); @@ -640,23 +693,30 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) { - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - return; - } + if (tryPromoteAllocaToVector(&I, AS)) + return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); // Don't promote the alloca to LDS for shader calling conventions as the work // item ID intrinsics are not supported for these calling conventions. // Furthermore not all LDS is available for some of the stages. - if (AMDGPU::isShader(ContainingFunction.getCallingConv())) - return; + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); + return false; + } + + // Not likely to have sufficient local memory for promotion. + if (!SufficientLDS) + return false; const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); - // FIXME: We should also try to get this value from the reqd_work_group_size - // function attribute if it is available. 
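For the LDS path that handleAlloca falls back to, the alloca has to be replicated per work item, so the size checked against LocalMemLimit is the per-item size scaled by the maximum flat work-group size read just below. A worked example with invented numbers:

#include <cstdint>

// Hypothetical kernel: a 'float tmp[4]' alloca (16 bytes per work item) and a
// maximum flat work-group size of 256 (ST.getFlatWorkGroupSizes(F).second).
constexpr uint64_t TypeAllocSize = 16;
constexpr uint32_t WorkGroupSize = 256;
constexpr uint64_t LDSForAlloca = WorkGroupSize * TypeAllocSize; // 4096 bytes

// Promotion only proceeds if CurrentLocalMemUsage plus this amount, after
// alignment, still fits under the LocalMemLimit from hasSufficientLocalMem.
static_assert(LDSForAlloca == 4096, "each work item gets its own 16-byte slice");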
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); @@ -678,7 +738,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); - return; + return false; } CurrentLocalMemUsage = NewSize; @@ -687,7 +747,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; + return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); @@ -701,7 +761,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, - AMDGPUAS::LOCAL_ADDRESS); + AS.LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(I.getAlignment()); @@ -734,7 +794,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); Type *EltTy = Src0->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); @@ -751,7 +811,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. @@ -819,22 +879,23 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, - { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } + { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) } ); - CallInst *NewCall - = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); + CallInst *NewCall = Builder.CreateCall( + ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: - Intr->dump(); + Intr->print(errs()); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + return true; } -FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) { - return new AMDGPUPromoteAlloca(TM); +FunctionPass *llvm::createAMDGPUPromoteAlloca() { + return new AMDGPUPromoteAlloca(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp new file mode 100644 index 0000000..36d88f5 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp @@ -0,0 +1,353 @@ +//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===// + +#ifdef AMDGPU_REG_ASM_NAMES + +static const char *const VGPR32RegNames[] = { + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35", + "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44", + "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53", + "v54", "v55", "v56", "v57", 
"v58", "v59", "v60", "v61", "v62", + "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", + "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80", + "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", + "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116", + "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125", + "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134", + "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143", + "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152", + "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161", + "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170", + "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188", + "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197", + "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206", + "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215", + "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224", + "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233", + "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242", + "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" +}; + +static const char *const SGPR32RegNames[] = { + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", + "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", + "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", + "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", + "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", + "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59", + "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", + "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", + "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89", + "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99", + "s100", "s101", "s102", "s103" +}; + +static const char *const VGPR64RegNames[] = { + "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]", + "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]", + "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]", + "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]", + "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]", + "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]", + "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]", + "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]", + "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]", + "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]", + "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]", + "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]", + "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]", + "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]", + "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]", + "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]", + "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]", + "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]", + "v[90:91]", 
"v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]", + "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]", + "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]", + "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]", + "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]", + "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]", + "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]", + "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]", + "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]", + "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]", + "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]", + "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]", + "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]", + "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]", + "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]", + "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]", + "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]", + "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]", + "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]", + "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]", + "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]", + "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]", + "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]", + "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]", + "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]", + "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]", + "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]", + "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]", + "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]", + "v[235:236]", "v[236:237]", "v[237:238]", "v[238:239]", "v[239:240]", + "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]", + "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]", + "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]" +}; + +static const char *const VGPR96RegNames[] = { + "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]", + "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]", + "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]", + "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]", + "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]", + "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]", + "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]", + "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]", + "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]", + "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]", + "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]", + "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]", + "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]", + "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]", + "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]", + "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]", + "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]", + "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]", + "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]", 
+ "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]", + "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]", + "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]", + "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]", + "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]", + "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]", + "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]", + "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]", + "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]", + "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]", + "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]", + "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]", + "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]", + "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]", + "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]", + "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]", + "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]", + "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]", + "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]", + "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]", + "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]", + "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]", + "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]", + "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]", + "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]", + "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]", + "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]", + "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]", + "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]", + "v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]", + "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]", + "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]" +}; + +static const char *const VGPR128RegNames[] = { + "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]", + "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]", + "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]", + "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]", + "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]", + "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]", + "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]", + "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]", + "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]", + "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]", + "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]", + "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]", + "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]", + "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]", + "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]", + "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]", + "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]", + "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]", + "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]", + "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", 
"v[99:102]", + "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]", + "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]", + "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]", + "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]", + "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]", + "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]", + "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]", + "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]", + "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]", + "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]", + "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]", + "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]", + "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]", + "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]", + "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]", + "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]", + "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]", + "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]", + "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]", + "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]", + "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]", + "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]", + "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]", + "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]", + "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]", + "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]", + "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]", + "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]", + "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", "v[244:247]", + "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]", + "v[250:253]", "v[251:254]", "v[252:255]" +}; + +static const char *const VGPR256RegNames[] = { + "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]", + "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]", + "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]", + "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]", + "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]", + "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]", + "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]", + "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]", + "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]", + "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]", + "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]", + "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]", + "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]", + "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]", + "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]", + "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]", + "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]", + "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]", + "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]", + "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]", + "v[100:107]", "v[101:108]", "v[102:109]", 
"v[103:110]", "v[104:111]", + "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]", + "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]", + "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]", + "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]", + "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]", + "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]", + "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]", + "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]", + "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]", + "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]", + "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]", + "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]", + "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]", + "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]", + "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]", + "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]", + "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]", + "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]", + "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]", + "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]", + "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]", + "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]", + "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]", + "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]", + "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]", + "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]", + "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]", + "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]", + "v[245:252]", "v[246:253]", "v[247:254]", "v[248:255]" +}; + +static const char *const VGPR512RegNames[] = { + "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]", + "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]", + "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]", + "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]", + "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]", + "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]", + "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]", + "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]", + "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]", + "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]", + "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]", + "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]", + "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]", + "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]", + "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]", + "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]", + "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]", + "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]", + "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]", + "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]", + "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]", + "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]", + 
"v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]", + "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]", + "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]", + "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]", + "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]", + "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]", + "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]", + "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]", + "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]", + "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]", + "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]", + "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]", + "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]", + "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]", + "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]", + "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]", + "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]", + "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]", + "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]", + "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]", + "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]", + "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]", + "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]", + "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]", + "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]", + "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]", + "v[240:255]" +}; + +static const char *const SGPR64RegNames[] = { + "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]", + "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]", + "s[24:25]", "s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]", + "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]", + "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]", + "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]", + "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]", + "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]", + "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]" +}; + +static const char *const SGPR128RegNames[] = { + "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]", + "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]", + "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]", + "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]", + "s[96:99]", "s[100:103]" +}; + +static const char *const SGPR256RegNames[] = { + "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]", + "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]", + "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]", + "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]", + "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]" +}; + +static const char *const SGPR512RegNames[] = { + "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]", + "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]", + "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", 
"s[64:79]", "s[68:83]", + "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]" +}; + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp new file mode 100644 index 0000000..623b2c8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -0,0 +1,231 @@ +//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPUInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "AMDGPUGenRegisterBank.inc" + +// This file will be TableGen'ed at some point. +#include "AMDGPUGenRegisterBankInfo.def" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) + : AMDGPUGenRegisterBankInfo(), + TRI(static_cast<const SIRegisterInfo*>(&TRI)) { + + // HACK: Until this is fully tablegen'd + static bool AlreadyInit = false; + if (AlreadyInit) + return; + + AlreadyInit = true; + + const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); + (void)RBSGPR; + assert(&RBSGPR == &AMDGPU::SGPRRegBank); + + const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); + (void)RBVGPR; + assert(&RBVGPR == &AMDGPU::VGPRRegBank); + +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + + if (TRI->isSGPRClass(&RC)) + return getRegBank(AMDGPU::SGPRRegBankID); + + return getRegBank(AMDGPU::VGPRRegBankID); +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + InstructionMappings AltMappings; + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + // FIXME: Should we be hard coding the size for these mappings? 
+ const InstructionMapping &SSMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&SSMapping); + + const InstructionMapping &VVMapping = getInstructionMapping( + 2, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&VVMapping); + + // FIXME: Should this be the pointer-size (64-bits) or the size of the + // register that will hold the bufffer resourc (128-bits). + const InstructionMapping &VSMapping = getInstructionMapping( + 3, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&VSMapping); + + return AltMappings; + + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AMDGPURegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + return applyDefaultMapping(OpdMapper); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return AMDGPU::isUniformMMO(MMO); +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + + const ValueMapping *ValMapping; + const ValueMapping *PtrMapping; + + if (isInstrUniform(MI)) { + // We have a uniform instruction so we want to use an SMRD load + ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); + } else { + ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: What would happen if we used SGPRRegBankID here? + PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + } + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( + 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); + return Mapping; + + // FIXME: Do we want to add a mapping for FLAT load, or should we just + // handle that during instruction selection? 
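    // (Editor's note, not part of the patch: for a uniform 32-bit load the
    // mapping built above works out to { dst: SGPR x 32, ptr: SGPR x 64 };
    // for a divergent one it is { dst: VGPR x 32, ptr: VGPR x 64 }.)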
+} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); + + if (Mapping.isValid()) + return Mapping; + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + + bool IsComplete = true; + switch (MI.getOpcode()) { + default: + IsComplete = false; + break; + case AMDGPU::G_CONSTANT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case AMDGPU::G_GEP: { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg()) + continue; + + unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits(); + OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + } + break; + } + case AMDGPU::G_STORE: { + assert(MI.getOperand(0).isReg()); + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + // FIXME: We need to specify a different reg bank once scalar stores + // are supported. + const ValueMapping *ValMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: Depending on the type of store, the pointer could be in + // the SGPR Reg bank. + // FIXME: Pointer size should be based on the address space. + const ValueMapping *PtrMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + break; + } + + case AMDGPU::G_LOAD: + return getInstrMappingForLoad(MI); + } + + if (!IsComplete) { + unsigned BankID = AMDGPU::SGPRRegBankID; + + unsigned Size = 0; + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + // If the operand is not a register default to the size of the previous + // operand. + // FIXME: Can't we pull the types from the MachineInstr rather than the + // operands. + if (MI.getOperand(Idx).isReg()) + Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI); + OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size)); + } + } + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h new file mode 100644 index 0000000..201fdc1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -0,0 +1,65 @@ +//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +namespace llvm { + +class SIRegisterInfo; +class TargetRegisterInfo; + +namespace AMDGPU { +enum { + SGPRRegBankID = 0, + VGPRRegBankID = 1, + NumRegisterBanks +}; +} // End AMDGPU namespace. 
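[Editor's illustration, not part of the patch] The two bank IDs above encode the policy implemented in AMDGPURegisterBankInfo.cpp: loads whose memory operand is known to be uniform are mapped to the SGPR bank so they can later be selected as scalar (SMEM) loads, while everything else stays in the VGPR bank. A minimal sketch of that rule, using a hypothetical helper name:

// Hypothetical helper for illustration only; mirrors getInstrMappingForLoad.
static unsigned pickBankForLoad(bool IsUniformMMO) {
  return IsUniformMMO ? AMDGPU::SGPRRegBankID   // scalar (SMEM) path
                      : AMDGPU::VGPRRegBankID;  // vector (FLAT/MUBUF) path
}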
+ +/// This class provides the information for the target register banks. +class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { + +protected: + +#define GET_TARGET_REGBANK_CLASS +#include "AMDGPUGenRegisterBank.inc" +}; +class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const SIRegisterInfo *TRI; + + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + + const RegisterBankInfo::InstructionMapping & + getInstrMappingForLoad(const MachineInstr &MI) const; + +public: + AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; + + const InstructionMapping & + getInstrMapping(const MachineInstr &MI) const override; +}; +} // End llvm namespace. +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td new file mode 100644 index 0000000..f4428e5 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -0,0 +1,16 @@ +//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def SGPRRegBank : RegisterBank<"SGPR", + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] +>; + +def VGPRRegBank : RegisterBank<"VGPR", + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 941f2d8..ff58aa5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -14,6 +14,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIRegisterInfo.h" using namespace llvm; @@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. //===----------------------------------------------------------------------===// -// Dummy to not crash RegisterClassInfo. -static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs( - const MachineFunction *) const { - return &CalleeSavedReg; -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, @@ -50,3 +39,34 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" + +// Forced to be here by one .inc +const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( + const MachineFunction *MF) const { + CallingConv::ID CC = MF->getFunction()->getCallingConv(); + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_SaveList; + default: { + // Dummy to not crash RegisterClassInfo. 
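      // (Editor's note, not part of the patch: callee-saved register lists
      // are terminated by 0, and AMDGPU::NoRegister is 0, so returning a
      // pointer to this single entry yields an empty save list.)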
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; + return &NoCalleeSavedReg; + } + } +} + +const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_RegMask; + default: + return nullptr; + } +} + +unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index ef51aad..d8604d2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -16,10 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H -#include "llvm/Target/TargetRegisterInfo.h" - #define GET_REGINFO_HEADER -#define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" namespace llvm { @@ -33,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h deleted file mode 100644 index ecd2ac7..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ /dev/null @@ -1,193 +0,0 @@ -//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Enums and structure types used by runtime metadata. -/// -/// Runtime requests certain information (metadata) about kernels to be able -/// to execute the kernels and answer the queries about the kernels. -/// The metadata is represented as a note element in the .note ELF section of a -/// binary (code object). The desc field of the note element is a YAML string -/// consisting of key-value pairs. Each key is a string. Each value can be -/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps. -/// At the beginning of the YAML string is the module level YAML map. A -/// kernel-level YAML map is in the amd.Kernels sequence. A -/// kernel-argument-level map is in the amd.Args sequence. -/// -/// The format should be kept backward compatible. New enum values and bit -/// fields should be appended at the end. It is suggested to bump up the -/// revision number whenever the format changes and document the change -/// in the revision in this header. -/// -// -//===----------------------------------------------------------------------===// -// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H - -#include <cstdint> -#include <vector> -#include <string> - -namespace AMDGPU { - -namespace RuntimeMD { - - // Version and revision of runtime metadata - const unsigned char MDVersion = 2; - const unsigned char MDRevision = 0; - - // Name of keys for runtime metadata. 
- namespace KeyName { - const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version - const char Language[] = "amd.Language"; // Language - const char LanguageVersion[] = "amd.LanguageVersion"; // Language version - const char Kernels[] = "amd.Kernels"; // Kernels - const char KernelName[] = "amd.KernelName"; // Kernel name - const char Args[] = "amd.Args"; // Kernel arguments - const char ArgSize[] = "amd.ArgSize"; // Kernel arg size - const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment - const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name - const char ArgName[] = "amd.ArgName"; // Kernel name - const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind - const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type - const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier - const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier - const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified - const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified - const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified - const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified - const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size - const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint - const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint - const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue - const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups - const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information - const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier - const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type - } - - namespace KernelArg { - enum Kind : uint8_t { - ByValue = 0, - GlobalBuffer = 1, - DynamicSharedPointer = 2, - Sampler = 3, - Image = 4, - Pipe = 5, - Queue = 6, - HiddenGlobalOffsetX = 7, - HiddenGlobalOffsetY = 8, - HiddenGlobalOffsetZ = 9, - HiddenNone = 10, - HiddenPrintfBuffer = 11, - HiddenDefaultQueue = 12, - HiddenCompletionAction = 13, - }; - - enum ValueType : uint16_t { - Struct = 0, - I8 = 1, - U8 = 2, - I16 = 3, - U16 = 4, - F16 = 5, - I32 = 6, - U32 = 7, - F32 = 8, - I64 = 9, - U64 = 10, - F64 = 11, - }; - - // Avoid using 'None' since it conflicts with a macro in X11 header file. - enum AccessQualifer : uint8_t { - AccNone = 0, - ReadOnly = 1, - WriteOnly = 2, - ReadWrite = 3, - }; - - enum AddressSpaceQualifer : uint8_t { - Private = 0, - Global = 1, - Constant = 2, - Local = 3, - Generic = 4, - Region = 5, - }; - } // namespace KernelArg - - // Invalid values are used to indicate an optional key should not be emitted. - const uint8_t INVALID_ADDR_QUAL = 0xff; - const uint8_t INVALID_ACC_QUAL = 0xff; - const uint32_t INVALID_KERNEL_INDEX = ~0U; - - namespace KernelArg { - // In-memory representation of kernel argument information. 
- struct Metadata { - uint32_t Size; - uint32_t Align; - uint32_t PointeeAlign; - uint8_t Kind; - uint16_t ValueType; - std::string TypeName; - std::string Name; - uint8_t AddrQual; - uint8_t AccQual; - uint8_t IsVolatile; - uint8_t IsConst; - uint8_t IsRestrict; - uint8_t IsPipe; - Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0), - AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0), - IsConst(0), IsRestrict(0), IsPipe(0) {} - }; - } - - namespace Kernel { - // In-memory representation of kernel information. - struct Metadata { - std::string Name; - std::string Language; - std::vector<uint8_t> LanguageVersion; - std::vector<uint32_t> ReqdWorkGroupSize; - std::vector<uint32_t> WorkGroupSizeHint; - std::string VecTypeHint; - uint32_t KernelIndex; - uint8_t NoPartialWorkGroups; - std::vector<KernelArg::Metadata> Args; - Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {} - }; - } - - namespace Program { - // In-memory representation of program information. - struct Metadata { - std::vector<uint8_t> MDVersionSeq; - std::vector<std::string> PrintfInfo; - std::vector<Kernel::Metadata> Kernels; - - explicit Metadata(){} - - // Construct from an YAML string. - explicit Metadata(const std::string &YAML); - - // Convert to YAML string. - std::string toYAML(); - - // Convert from YAML string. - static Metadata fromYAML(const std::string &S); - }; - } -} // namespace RuntimeMD -} // namespace AMDGPU - -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index c35a67d..7796176 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,8 +13,18 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" +#endif +#include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetFrameLowering.h" #include <algorithm> @@ -22,7 +32,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -#define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" @@ -41,9 +50,10 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-for-global,+unaligned-buffer-access,"; + FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -59,9 +69,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? 
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP16Denormals = false; + FP64FP16Denormals = false; FP32Denormals = false; - FP64Denormals = false; } // Set defaults if needed. @@ -71,6 +80,31 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, return *this; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct SIGISelActualAccessor : public GISelAccessor { + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + const AMDGPUCallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUGenSubtargetInfo(TT, GPU, FS), @@ -85,15 +119,18 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FastFMAF32(false), HalfRate64Ops(false), - FP16Denormals(false), FP32Denormals(false), - FP64Denormals(false), + FP64FP16Denormals(false), FPExceptions(false), + DX10Clamp(false), FlatForGlobal(false), + AutoWaitcntBeforeBarrier(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), + HasApertureRegs(false), EnableXNACK(false), + TrapHandler(false), DebuggerInsertNops(false), DebuggerReserveRegs(false), DebuggerEmitPrologue(false), @@ -110,14 +147,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), + GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), Has16BitInsts(false), + HasVOP3PInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), HasInv2PiInlineImm(false), + HasSDWA(false), + HasSDWAOmod(false), + HasSDWAScalar(false), + HasSDWASdst(false), + HasSDWAMac(false), + HasSDWAOutModsVOPC(false), + HasDPP(false), FlatAddressSpace(false), + FlatInstOffsets(false), + FlatGlobalInsts(false), + FlatScratchInsts(false), R600ALUInst(false), CaymanISA(false), @@ -128,65 +177,30 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FeatureDisable(false), InstrItins(getInstrItineraryForCPU(GPU)) { + AS = AMDGPU::getAMDGPUAS(TT); initializeSubtargetDependencies(TT, GPU, FS); } -// FIXME: These limits are for SI. Did they change with the larger maximum LDS -// size? 
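[Editor's note, not part of the patch] The hard-coded wave/byte table removed below is replaced further down by a closed-form computation. As a worked example under assumed parameters (64 KiB of LDS, at most 10 waves per EU, 16 work-groups per CU for the given flat work-group size): getOccupancyWithLocalMemSize(8192) = min(max(65536 * 10 / 16 / 8192, 1), 10) = 5 waves.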
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { - switch (NWaves) { - case 10: - return 1638; - case 9: - return 1820; - case 8: - return 2048; - case 7: - return 2340; - case 6: - return 2730; - case 5: - return 3276; - case 4: - return 4096; - case 3: - return 5461; - case 2: - return 8192; - default: +unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, + const Function &F) const { + if (NWaves == 1) return getLocalMemorySize(); - } + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { - if (Bytes <= 1638) - return 10; - - if (Bytes <= 1820) - return 9; - - if (Bytes <= 2048) - return 8; - - if (Bytes <= 2340) - return 7; - - if (Bytes <= 2730) - return 6; - - if (Bytes <= 3276) - return 5; - - if (Bytes <= 4096) - return 4; - - if (Bytes <= 5461) - return 3; - - if (Bytes <= 8192) - return 2; - - return 1; +unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, + const Function &F) const { + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; + unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); + NumWaves = std::min(NumWaves, MaxWaves); + NumWaves = std::max(NumWaves, 1u); + return NumWaves; } std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( @@ -224,7 +238,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair<unsigned, unsigned> Default(1, 0); + std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); // Default/requested minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); @@ -263,12 +277,74 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // Make sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; } +bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { + Function *Kernel = I->getParent()->getParent(); + unsigned MinSize = 0; + unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; + bool IdQuery = false; + + // If reqd_work_group_size is present it narrows value down. 
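  // (Editor's note, not part of the patch: e.g. a reqd_work_group_size of
  // {64, 1, 1} pins dimension 0 to 64, so a workitem.id.x query below gets
  // !range [0, 64) while a local_size.x query gets !range [64, 65).)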
+ if (auto *CI = dyn_cast<CallInst>(I)) { + const Function *F = CI->getCalledFunction(); + if (F) { + unsigned Dim = UINT_MAX; + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + IdQuery = true; + LLVM_FALLTHROUGH; + case Intrinsic::r600_read_local_size_x: + Dim = 0; + break; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + IdQuery = true; + LLVM_FALLTHROUGH; + case Intrinsic::r600_read_local_size_y: + Dim = 1; + break; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + IdQuery = true; + LLVM_FALLTHROUGH; + case Intrinsic::r600_read_local_size_z: + Dim = 2; + break; + default: + break; + } + if (Dim <= 3) { + if (auto Node = Kernel->getMetadata("reqd_work_group_size")) + if (Node->getNumOperands() == 3) + MinSize = MaxSize = mdconst::extract<ConstantInt>( + Node->getOperand(Dim))->getZExtValue(); + } + } + } + + if (!MaxSize) + return false; + + // Range metadata is [Lo, Hi). For ID query we need to pass max size + // as Hi. For size query we need to pass Hi + 1. + if (IdQuery) + MinSize = 0; + else + ++MaxSize; + + MDBuilder MDB(I->getContext()); + MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), + APInt(32, MaxSize)); + I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + return true; +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUSubtarget(TT, GPU, FS, TM), @@ -277,11 +353,23 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, TLInfo(TM, *this) {} SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) : - AMDGPUSubtarget(TT, GPU, FS, TM), - InstrInfo(*this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) {} + const TargetMachine &TM) + : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + TLInfo(TM, *this) { +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); + GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new AMDGPULegalizerInfo()); + + GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + GISel->InstSelector.reset(new AMDGPUInstructionSelector( + *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get()))); +#endif + setGISelAccessor(*GISel); +} void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { @@ -305,7 +393,7 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { } unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplicitArgBytes) const { + unsigned ExplicitArgBytes) const { unsigned ImplicitBytes = getImplicitArgNumBytes(MF); if (ImplicitBytes == 0) return ExplicitArgBytes; @@ -359,12 +447,100 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { return 1; } -unsigned SISubtarget::getMaxNumSGPRs() const { +unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.hasFlatScratchInit()) { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 
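      // (Editor's note, not part of the patch: FLAT_SCRATCH, XNACK_MASK and
      // VCC are each 64-bit register pairs, i.e. two SGPRs apiece, which is
      // where the 6 above and the 4 and 2 below come from.)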
+ if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) + return 4; // FLAT_SCRATCH, VCC (in that order). + } + + if (isXNACKEnabled()) + return 4; // XNACK, VCC (in that order). + return 2; // VCC. +} + +unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of SGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); + unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); + + // Check if maximum number of SGPRs was explicitly requested using + // "amdgpu-num-sgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-sgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-sgpr", MaxNumSGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && (Requested <= getReservedNumSGPRs(MF))) + Requested = 0; + + // If more SGPRs are required to support the input user/system SGPRs, + // increase to accommodate them. + // + // FIXME: This really ends up using the requested number of SGPRs + number + // of reserved special registers in total. Theoretically you could re-use + // the last input registers for these special registers, but this would + // require a lot of complexity to deal with the weird aliasing. + unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); + if (Requested && Requested < InputNumSGPRs) + Requested = InputNumSGPRs; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. + if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumSGPRs = Requested; + } + if (hasSGPRInitBug()) - return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - if (getGeneration() >= VOLCANIC_ISLANDS) - return 102; + return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), + MaxAddressableNumSGPRs); +} + +unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of VGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); + + // Check if maximum number of VGPRs was explicitly requested using + // "amdgpu-num-vgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-vgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-vgpr", MaxNumVGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && Requested <= getReservedNumVGPRs(MF)) + Requested = 0; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. 
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumVGPRs = Requested; + } - return 104; + return MaxNumVGPRs - getReservedNumVGPRs(MF); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 0e3cb7d..d4b6a5f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -16,12 +16,13 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H #include "AMDGPU.h" -#include "R600InstrInfo.h" -#include "R600ISelLowering.h" #include "R600FrameLowering.h" -#include "SIInstrInfo.h" -#include "SIISelLowering.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" #include "SIFrameLowering.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" @@ -51,19 +52,47 @@ public: SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS, + GFX9, }; enum { ISAVersion0_0_0, + ISAVersion6_0_0, + ISAVersion6_0_1, ISAVersion7_0_0, ISAVersion7_0_1, ISAVersion7_0_2, + ISAVersion7_0_3, ISAVersion8_0_0, ISAVersion8_0_1, ISAVersion8_0_2, ISAVersion8_0_3, ISAVersion8_0_4, ISAVersion8_1_0, + ISAVersion9_0_0, + ISAVersion9_0_1, + ISAVersion9_0_2, + ISAVersion9_0_3 + }; + + enum TrapHandlerAbi { + TrapHandlerAbiNone = 0, + TrapHandlerAbiHsa = 1 + }; + + enum TrapID { + TrapIDHardwareReserved = 0, + TrapIDHSADebugTrap = 1, + TrapIDLLVMTrap = 2, + TrapIDLLVMDebugTrap = 3, + TrapIDDebugBreakpoint = 7, + TrapIDDebugReserved8 = 8, + TrapIDDebugReservedFE = 0xfe, + TrapIDDebugReservedFF = 0xff + }; + + enum TrapRegValues { + LLVMTrapHandlerRegValue = 1 }; protected: @@ -81,14 +110,17 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. 
- bool FP16Denormals; bool FP32Denormals; - bool FP64Denormals; + bool FP64FP16Denormals; bool FPExceptions; + bool DX10Clamp; bool FlatForGlobal; + bool AutoWaitcntBeforeBarrier; bool UnalignedScratchAccess; bool UnalignedBufferAccess; + bool HasApertureRegs; bool EnableXNACK; + bool TrapHandler; bool DebuggerInsertNops; bool DebuggerReserveRegs; bool DebuggerEmitPrologue; @@ -107,14 +139,26 @@ protected: bool GCN1Encoding; bool GCN3Encoding; bool CIInsts; + bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; bool Has16BitInsts; + bool HasVOP3PInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; bool HasInv2PiInlineImm; + bool HasSDWA; + bool HasSDWAOmod; + bool HasSDWAScalar; + bool HasSDWASdst; + bool HasSDWAMac; + bool HasSDWAOutModsVOPC; + bool HasDPP; bool FlatAddressSpace; + bool FlatInstOffsets; + bool FlatGlobalInsts; + bool FlatScratchInsts; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -127,6 +171,7 @@ protected: InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; + AMDGPUAS AS; public: AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, @@ -161,7 +206,8 @@ public: } bool isOpenCLEnv() const { - return TargetTriple.getEnvironment() == Triple::OpenCL; + return TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getEnvironmentName() == "amdgizcl"; } Generation getGeneration() const { @@ -184,10 +230,18 @@ public: return MaxPrivateElementSize; } + AMDGPUAS getAMDGPUAS() const { + return AS; + } + bool has16BitInsts() const { return Has16BitInsts; } + bool hasVOP3PInsts() const { + return HasVOP3PInsts; + } + bool hasHWFP64() const { return FP64; } @@ -243,6 +297,14 @@ public: return (getGeneration() >= EVERGREEN); } + bool hasMed3_16() const { + return getGeneration() >= GFX9; + } + + bool hasMin3Max3_16() const { + return getGeneration() >= GFX9; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } @@ -255,6 +317,10 @@ public: return CaymanISA; } + TrapHandlerAbi getTrapHandlerAbi() const { + return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } @@ -267,20 +333,22 @@ public: return DumpCode; } - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. - unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if /// the given LDS memory size is the only constraint. 
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction()); + } bool hasFP16Denormals() const { - return FP16Denormals; + return FP64FP16Denormals; } bool hasFP32Denormals() const { @@ -288,17 +356,33 @@ public: } bool hasFP64Denormals() const { - return FP64Denormals; + return FP64FP16Denormals; + } + + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; } bool hasFPExceptions() const { return FPExceptions; } + bool enableDX10Clamp() const { + return DX10Clamp; + } + + bool enableIEEEBit(const MachineFunction &MF) const { + return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + } + bool useFlatForGlobal() const { return FlatForGlobal; } + bool hasAutoWaitcntBeforeBarrier() const { + return AutoWaitcntBeforeBarrier; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } @@ -307,10 +391,34 @@ public: return UnalignedScratchAccess; } + bool hasApertureRegs() const { + return HasApertureRegs; + } + + bool isTrapHandlerEnabled() const { + return TrapHandler; + } + bool isXNACKEnabled() const { return EnableXNACK; } + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool hasFlatInstOffsets() const { + return FlatInstOffsets; + } + + bool hasFlatGlobalInsts() const { + return FlatGlobalInsts; + } + + bool hasFlatScratchInsts() const { + return FlatScratchInsts; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } @@ -324,6 +432,34 @@ public: return isAmdHsaOS() || isMesaKernel(MF); } + bool hasFminFmaxLegacy() const { + return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + + bool hasSDWA() const { + return HasSDWA; + } + + bool hasSDWAOmod() const { + return HasSDWAOmod; + } + + bool hasSDWAScalar() const { + return HasSDWAScalar; + } + + bool hasSDWASdst() const { + return HasSDWASdst; + } + + bool hasSDWAMac() const { + return HasSDWAMac; + } + + bool hasSDWAOutModsVOPC() const { + return HasSDWAOutModsVOPC; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -342,9 +478,11 @@ public: return 0; } + // Scratch is allocated in 256 dword per wave blocks for the entire + // wavefront. When viewed from the perspecive of an arbitrary workitem, this + // is 4-byte aligned. unsigned getStackAlignment() const { - // Scratch is allocated in 256 dword per wave blocks. - return 4 * 256 / getWavefrontSize(); + return 4; } bool enableMachineScheduler() const override { @@ -355,72 +493,71 @@ public: return true; } + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return 4; + return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); } /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. 
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16; + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return getMaxWavesPerEU() * getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return getWavesPerWorkGroup(FlatWorkGroupSize); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum number of waves per execution unit supported by the /// subtarget. unsigned getMinWavesPerEU() const { - return 1; + return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - // FIXME: Need to take scratch memory into account. - return 10; + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) / - getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum flat work group size supported by the subtarget. unsigned getMinFlatWorkGroupSize() const { - return 1; + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); } /// \returns Maximum flat work group size supported by the subtarget. unsigned getMaxFlatWorkGroupSize() const { - return 2048; + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); } - /// \returns Number of waves per work group given the flat work group size. + /// \returns Number of waves per work group supported by the subtarget and + /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize(); + return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), + FlatWorkGroupSize); } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes /// for function \p F, or minimum/maximum flat work group sizes explicitly /// requested using "amdgpu-flat-work-group-size" attribute attached to @@ -440,6 +577,9 @@ public: /// compatible with minimum/maximum number of waves limited by flat work group /// size, register usage, and/or lds usage. std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; + + /// Creates value range metadata on an workitemid.* inrinsic call or load. 
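Aside, not part of the patch: the inline defaults removed in this hunk (4 EUs per CU, at most 10 waves per EU, work groups rounded up to whole waves) amount to a little occupancy arithmetic that now lives behind AMDGPU::IsaInfo. A standalone sketch using those old numbers, with the wavefront size assumed to be 64:

#include <cstdio>

// Same rounding the removed code did with llvm::alignTo().
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const unsigned WavefrontSize = 64;      // assumption: GCN wave size
  const unsigned EUsPerCU = 4;            // old getEUsPerCU()
  const unsigned FlatWorkGroupSize = 256; // example input

  // Old getWavesPerWorkGroup(): round the work group up to whole waves.
  unsigned WavesPerWG =
      alignTo(FlatWorkGroupSize, WavefrontSize) / WavefrontSize;      // 4
  // Old getMaxWavesPerEU(FlatWorkGroupSize): spread those waves over the EUs.
  unsigned MaxWavesPerEU = alignTo(WavesPerWG, EUsPerCU) / EUsPerCU;  // 1
  std::printf("waves/workgroup = %u, max waves/EU = %u\n",
              WavesPerWG, MaxWavesPerEU);
  return 0;
}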
+ bool makeLIDRangeMetadata(Instruction *I) const; }; class R600Subtarget final : public AMDGPUSubtarget { @@ -482,13 +622,6 @@ public: }; class SISubtarget final : public AMDGPUSubtarget { -public: - enum { - // The closed Vulkan driver sets 96, which limits the wave count to 8 but - // doesn't spill SGPRs as much as when 80 is set. - FIXED_SGPR_COUNT_FOR_INIT_BUG = 96 - }; - private: SIInstrInfo InstrInfo; SIFrameLowering FrameLowering; @@ -516,6 +649,21 @@ public: return GISel->getCallLowering(); } + const InstructionSelector *getInstructionSelector() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getInstructionSelector(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getLegalizerInfo(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); + } + const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } @@ -524,6 +672,11 @@ public: this->GISel.reset(&GISel); } + // XXX - Why is this here if it isn't in the default pass set? + bool enableEarlyIfConversion() const override { + return true; + } + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; @@ -533,10 +686,6 @@ public: return 16; } - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - bool hasSMemRealTime() const { return HasSMemRealTime; } @@ -549,6 +698,10 @@ public: return HasVGPRIndexMode; } + bool useVGPRIndexMode(bool UserEnable) const { + return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); + } + bool hasScalarCompareEq64() const { return getGeneration() >= VOLCANIC_ISLANDS; } @@ -561,6 +714,10 @@ public: return HasInv2PiInlineImm; } + bool hasDPP() const { + return HasDPP; + } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -594,6 +751,14 @@ public: return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } + bool hasSMovFedHazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + + bool hasReadM0Hazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs @@ -602,13 +767,104 @@ public: /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// \returns True if waitcnt instruction is needed before barrier instruction, - /// false otherwise. - bool needWaitcntBeforeBarrier() const { - return true; + /// \returns true if the flat_scratch register should be initialized with the + /// pointer to the wave's scratch memory rather than a size and offset. + bool flatScratchIsPointer() const { + return getGeneration() >= GFX9; + } + + /// \returns SGPR allocation granularity supported by the subtarget. + unsigned getSGPRAllocGranule() const { + return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); + } + + /// \returns SGPR encoding granularity supported by the subtarget. + unsigned getSGPREncodingGranule() const { + return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); + } + + /// \returns Total number of SGPRs supported by the subtarget. 
+ unsigned getTotalNumSGPRs() const { + return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); + } + + /// \returns Addressable number of SGPRs supported by the subtarget. + unsigned getAddressableNumSGPRs() const { + return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); + } + + /// \returns Minimum number of SGPRs that meets the given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMinNumSGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Maximum number of SGPRs that meets the given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { + return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, + Addressable); + } + + /// \returns Reserved number of SGPRs for given function \p MF. + unsigned getReservedNumSGPRs(const MachineFunction &MF) const; + + /// \returns Maximum number of SGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of SGPRs explicitly + /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumSGPRs(const MachineFunction &MF) const; + + /// \returns VGPR allocation granularity supported by the subtarget. + unsigned getVGPRAllocGranule() const { + return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits()); } - unsigned getMaxNumSGPRs() const; + /// \returns VGPR encoding granularity supported by the subtarget. + unsigned getVGPREncodingGranule() const { + return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); + } + + /// \returns Total number of VGPRs supported by the subtarget. + unsigned getTotalNumVGPRs() const { + return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); + } + + /// \returns Addressable number of VGPRs supported by the subtarget. + unsigned getAddressableNumVGPRs() const { + return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); + } + + /// \returns Minimum number of VGPRs that meets given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMinNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Maximum number of VGPRs that meets given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Reserved number of VGPRs for given function \p MF. + unsigned getReservedNumVGPRs(const MachineFunction &MF) const { + return debuggerReserveRegs() ? 4 : 0; + } + + /// \returns Maximum number of VGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of VGPRs explicitly + /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. 
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d8a0c71..dc868f0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -15,34 +15,37 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" +#include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" #include "SIMachineScheduler.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/AlwaysInliner.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/GVN.h" -#include "llvm/Transforms/Vectorize.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" #include <memory> using namespace llvm; @@ -58,6 +61,11 @@ static cl::opt<bool> EnableSROA( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> +EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); + static cl::opt<bool> EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -75,6 +83,43 @@ static cl::opt<bool> EnableLoadStoreVectorizer( static cl::opt<bool> ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), + cl::init(true), + cl::Hidden); + +// Option to run internalize pass. +static cl::opt<bool> InternalizeSymbols( + "amdgpu-internalize-symbols", + cl::desc("Enable elimination of non-kernel functions and unused globals"), + cl::init(false), + cl::Hidden); + +// Option to inline all early. +static cl::opt<bool> EarlyInlineAll( + "amdgpu-early-inline-all", + cl::desc("Inline all functions early"), + cl::init(false), + cl::Hidden); + +static cl::opt<bool> EnableSDWAPeephole( + "amdgpu-sdwa-peephole", + cl::desc("Enable SDWA peepholer"), + cl::init(true)); + +// Enable address space based alias analysis +static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); + +// Option to enable new waitcnt insertion pass. 
+static cl::opt<bool> EnableSIInsertWaitcntsPass( + "enable-si-insert-waitcnts", + cl::desc("Use new waitcnt insertion pass"), + cl::init(true)); + +// Option to run late CFG structurizer +static cl::opt<bool> LateCFGStructurize( + "amdgpu-late-structurize", + cl::desc("Enable late CFG structurization"), cl::init(false), cl::Hidden); @@ -86,22 +131,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() { PassRegistry *PR = PassRegistry::getPassRegistry(); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); + initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); + initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitsPass(*PR); + initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeAMDGPUUnifyDivergentExitNodesPass(*PR); + initializeAMDGPUAAWrapperPassPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -119,13 +171,27 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, - llvm::make_unique<GCNMaxOccupancySchedStrategy>(C)); + new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + return DAG; +} + +static ScheduleDAGInstrs * +createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } +static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { + return new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_MINREGFORCED); +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -139,6 +205,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler); +static MachineSchedRegistry +IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", + "Run GCN scheduler to maximize occupancy (experimental)", + createIterativeGCNMaxOccupancyMachineScheduler); + +static MachineSchedRegistry +GCNMinRegSchedRegistry("gcn-minreg", + "Run GCN iterative scheduler for minimal register usage (experimental)", + createMinRegScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. @@ -148,9 +224,14 @@ static StringRef computeDataLayout(const Triple &TT) { // 32-bit private, local, and region pointers. 
64-bit global, constant and // flat. - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + if (TT.getEnvironmentName() == "amdgiz" || + TT.getEnvironmentName() == "amdgizcl") + return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } LLVM_READNONE @@ -180,6 +261,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), TLOF(createTLOF(getTargetTriple())) { + AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } @@ -199,8 +281,74 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { FSAttr.getValueAsString(); } -void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { - PM.add(createAMDGPUUnifyMetadataPass()); +static ImmutablePass *createAMDGPUExternalAAWrapperPass() { + return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + }); +} + +void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { + Builder.DivergentTarget = true; + + bool Internalize = InternalizeSymbols && + (getOptLevel() > CodeGenOpt::None) && + (getTargetTriple().getArch() == Triple::amdgcn); + bool EarlyInline = EarlyInlineAll && + (getOptLevel() > CodeGenOpt::None); + bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None; + + Builder.addExtension( + PassManagerBuilder::EP_ModuleOptimizerEarly, + [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + PM.add(createAMDGPUUnifyMetadataPass()); + if (Internalize) { + PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool { + if (const Function *F = dyn_cast<Function>(&GV)) { + if (F->isDeclaration()) + return true; + switch (F->getCallingConv()) { + default: + return false; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + return !GV.use_empty(); + })); + PM.add(createGlobalDCEPass()); + } + if (EarlyInline) + PM.add(createAMDGPUAlwaysInlinePass(false)); + }); + + Builder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + }); + + Builder.addExtension( + PassManagerBuilder::EP_CGSCCOptimizerLate, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + // Add infer address spaces pass to the opt pipeline after inlining + // but before SROA to increase SROA opportunities. 
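Aside, not part of the patch: the new "amdgiz" layout string in computeDataLayout() above can be inspected with the stock DataLayout API. A hedged standalone sketch (the calls are ordinary LLVM as I read this revision, but nothing here is code from this change); it shows the 64-bit flat pointers, 32-bit private pointers and the A5 alloca address space the string encodes:

#include "llvm/IR/DataLayout.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Layout string copied from computeDataLayout() for the amdgiz environment.
  llvm::DataLayout DL(
      "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
      "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
      "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5");
  llvm::outs() << "alloca address space:  " << DL.getAllocaAddrSpace() << '\n'
               << "flat pointer bits:     " << DL.getPointerSizeInBits(0) << '\n'
               << "private (p5) ptr bits: " << DL.getPointerSizeInBits(5) << '\n';
  return 0;
}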
+ PM.add(createInferAddressSpacesPass()); + }); } //===----------------------------------------------------------------------===// @@ -240,19 +388,6 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct SIGISelActualAccessor : public GISelAccessor { - std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; - const AMDGPUCallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } -}; - -} // end anonymous namespace -#endif - GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -274,16 +409,6 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. resetTargetOptions(F); I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); - -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); - GISel->CallLoweringInfo.reset( - new AMDGPUCallLowering(*I->getTargetLowering())); -#endif - - I->setGISelAccessor(*GISel); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -299,7 +424,7 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // Exceptions and StackMaps are not supported, so these passes will never do // anything. @@ -330,7 +455,7 @@ public: class R600PassConfig final : public AMDGPUPassConfig { public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} ScheduleDAGInstrs *createMachineScheduler( @@ -346,7 +471,7 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} GCNTargetMachine &getGCNTargetMachine() const { @@ -356,9 +481,9 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; - void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; + bool addILPOpts() override; bool addInstSelector() override; #ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; @@ -406,11 +531,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { } void AMDGPUPassConfig::addIRPasses() { + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPULowerIntrinsicsPass()); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); @@ -421,17 +550,32 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + + addPass(createAMDGPUCodeGenPreparePass()); + } + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 
addPass(createAMDGPUOpenCLImageTypeLoweringPass()); - const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); if (TM.getOptLevel() > CodeGenOpt::None) { - addPass(createAMDGPUPromoteAlloca(&TM)); + addPass(createInferAddressSpacesPass()); + addPass(createAMDGPUPromoteAlloca()); if (EnableSROA) addPass(createSROAPass()); addStraightLineScalarOptimizationPasses(); + + if (EnableAMDGPUAliasAnalysis) { + addPass(createAMDGPUAAWrapperPass()); + addPass(createExternalAAWrapperPass([](Pass &P, Function &, + AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + })); + } } TargetPassConfig::addIRPasses(); @@ -487,26 +631,26 @@ bool R600PassConfig::addPreISel() { } void R600PassConfig::addPreRegAlloc() { - addPass(createR600VectorRegMerger(*TM)); + addPass(createR600VectorRegMerger()); } void R600PassConfig::addPreSched2() { addPass(createR600EmitClauseMarkers(), false); if (EnableR600IfConvert) addPass(&IfConverterID, false); - addPass(createR600ClauseMergePass(*TM), false); + addPass(createR600ClauseMergePass(), false); } void R600PassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(), false); - addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(createR600ExpandSpecialInstrsPass(), false); addPass(&FinalizeMachineBundlesID, false); - addPass(createR600Packetizer(*TM), false); - addPass(createR600ControlFlowFinalizer(*TM), false); + addPass(createR600Packetizer(), false); + addPass(createR600ControlFlowFinalizer(), false); } TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); + return new R600PassConfig(*this, PM); } //===----------------------------------------------------------------------===// @@ -526,12 +670,19 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(&AMDGPUAnnotateKernelFeaturesID); - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + addPass(createAMDGPUAnnotateKernelFeaturesPass()); + + // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit + // regions formed by them. + addPass(&AMDGPUUnifyDivergentExitNodesID); + if (!LateCFGStructurize) { + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + } addPass(createSinkingPass()); - addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); - addPass(createSIAnnotateControlFlowPass()); + if (!LateCFGStructurize) { + addPass(createSIAnnotateControlFlowPass()); + } return false; } @@ -549,13 +700,22 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); + if (EnableSDWAPeephole) { + addPass(&SIPeepholeSDWAID); + addPass(&MachineLICMID); + addPass(&MachineCSEID); + addPass(&SIFoldOperandsID); + addPass(&DeadMachineInstructionElimID); + } + addPass(createSIShrinkInstructionsPass()); } -void GCNPassConfig::addIRPasses() { - // TODO: May want to move later or split into an early and late one. 
- addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); +bool GCNPassConfig::addILPOpts() { + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); - AMDGPUPassConfig::addIRPasses(); + TargetPassConfig::addILPOpts(); + return false; } bool GCNPassConfig::addInstSelector() { @@ -572,20 +732,26 @@ bool GCNPassConfig::addIRTranslator() { } bool GCNPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); return false; } bool GCNPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); return false; } bool GCNPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); return false; } + #endif void GCNPassConfig::addPreRegAlloc() { - addPass(createSIShrinkInstructionsPass()); + if (LateCFGStructurize) { + addPass(createAMDGPUMachineCFGStructurizerPass()); + } addPass(createSIWholeQuadModePass()); } @@ -615,6 +781,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { + addPass(&SIFixVGPRCopiesID); addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } @@ -633,7 +800,10 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); - addPass(createSIInsertWaitsPass()); + if (EnableSIInsertWaitcntsPass) + addPass(createSIInsertWaitcntsPass()); + else + addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); addPass(createSIDebuggerInsertNopsPass()); @@ -641,5 +811,6 @@ void GCNPassConfig::addPreEmitPass() { } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); + return new GCNPassConfig(*this, PM); } + diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 9496773..a3c7c19 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUIntrinsicInfo IntrinsicInfo; + AMDGPUAS AS; StringRef getGPUName(const Function &F) const; StringRef getFeatureString(const Function &F) const; @@ -57,7 +58,17 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } - void addEarlyAsPossiblePasses(PassManagerBase &PM) override; + AMDGPUAS getAMDGPUAS() const { + return AS; + } + + void adjustPassManager(PassManagerBuilder &) override; + /// Get the integer value of a null pointer in the given address space. 
+ uint64_t getNullPointerValue(unsigned AddrSpace) const { + if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) + return -1; + return 0; + } }; //===----------------------------------------------------------------------===// @@ -77,6 +88,10 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; const R600Subtarget *getSubtargetImpl(const Function &) const override; + + bool isMachineVerifierClean() const override { + return false; + } }; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index 1fddc88..6c1885e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -9,10 +9,11 @@ #include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/ELF.h" -#include "Utils/AMDGPUBaseInfo.h" using namespace llvm; @@ -22,7 +23,8 @@ using namespace llvm; MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) && + auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) && AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple())) return TextSection; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index de32778..ca6210f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#include "AMDGPU.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetMachine.h" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index e904870..89a0390 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -20,8 +20,8 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -29,8 +29,41 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" +static cl::opt<unsigned> UnrollThresholdPrivate( + "amdgpu-unroll-threshold-private", + cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), + cl::init(2500), cl::Hidden); + +static cl::opt<unsigned> UnrollThresholdLocal( + "amdgpu-unroll-threshold-local", + cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), + cl::init(1000), cl::Hidden); + +static cl::opt<unsigned> UnrollThresholdIf( + "amdgpu-unroll-threshold-if", + cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), + cl::init(150), cl::Hidden); + +static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, + unsigned Depth = 0) { + 
const Instruction *I = dyn_cast<Instruction>(Cond); + if (!I) + return false; -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + for (const Value *V : I->operand_values()) { + if (!L->contains(I)) + continue; + if (const PHINode *PHI = dyn_cast<PHINode>(V)) { + if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { + return SubLoop->contains(PHI); })) + return true; + } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1)) + return true; + } + return false; +} + +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. UP.MaxCount = UINT_MAX; @@ -38,36 +71,122 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, // TODO: Do we want runtime unrolling? + // Maximum alloca size than can fit registers. Reserve 16 registers. + const unsigned MaxAlloca = (256 - 16) * 4; + unsigned ThresholdPrivate = UnrollThresholdPrivate; + unsigned ThresholdLocal = UnrollThresholdLocal; + unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); + AMDGPUAS ASST = ST->getAMDGPUAS(); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); + unsigned LocalGEPsSeen = 0; + + if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { + return SubLoop->contains(BB); })) + continue; // Block belongs to an inner loop. + for (const Instruction &I : *BB) { + + // Unroll a loop which contains an "if" statement whose condition + // defined by a PHI belonging to the loop. This may help to eliminate + // if region and potentially even PHI itself, saving on both divergence + // and registers used for the PHI. + // Add a small bonus for each of such "if" statements. + if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { + if (UP.Threshold < MaxBoost && Br->isConditional()) { + if (L->isLoopExiting(Br->getSuccessor(0)) || + L->isLoopExiting(Br->getSuccessor(1))) + continue; + if (dependsOnLocalPhi(L, Br->getCondition())) { + UP.Threshold += UnrollThresholdIf; + DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold + << " for loop:\n" << *L << " due to " << *Br << '\n'); + if (UP.Threshold >= MaxBoost) + return; + } + } + continue; + } + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); - if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + if (!GEP) + continue; + + unsigned AS = GEP->getAddressSpace(); + unsigned Threshold = 0; + if (AS == ASST.PRIVATE_ADDRESS) + Threshold = ThresholdPrivate; + else if (AS == ASST.LOCAL_ADDRESS) + Threshold = ThresholdLocal; + else + continue; + + if (UP.Threshold >= Threshold) continue; - const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = - dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); - if (Alloca) { - // We want to do whatever we can to limit the number of alloca - // instructions that make it through to the code generator. allocas - // require us to use indirect addressing, which is slow and prone to - // compiler bugs. If this loop does an address calculation on an - // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. 
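Aside, not part of the patch: an invented example of the loop shape the private-memory threshold above is chasing. Once the loop is fully unrolled, every index into the small array is a constant, the indirect addressing disappears, and SROA can promote the alloca to registers:

// Illustrative source only; names are made up.
void blur4(float *out, const float *in) {
  float window[4];             // becomes a private (scratch) alloca
  for (int i = 0; i < 4; ++i)  // candidate for the raised unroll threshold
    window[i] = in[i];         // indirect addressing while the loop is rolled
  float sum = 0.0f;
  for (int i = 0; i < 4; ++i)
    sum += window[i];          // after full unroll: window[0] + ... + window[3]
  *out = sum * 0.25f;
}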
- UP.Threshold = 800; + if (AS == ASST.PRIVATE_ADDRESS) { + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); + if (!Alloca || !Alloca->isStaticAlloca()) + continue; + Type *Ty = Alloca->getAllocatedType(); + unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; + if (AllocaSize > MaxAlloca) + continue; + } else if (AS == ASST.LOCAL_ADDRESS) { + LocalGEPsSeen++; + // Inhibit unroll for local memory if we have seen addressing not to + // a variable, most likely we will be unable to combine it. + // Do not unroll too deep inner loops for local memory to give a chance + // to unroll an outer loop for a more important reason. + if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || + (!isa<GlobalVariable>(GEP->getPointerOperand()) && + !isa<Argument>(GEP->getPointerOperand()))) + continue; } + + // Check if GEP depends on a value defined by this loop itself. + bool HasLoopDef = false; + for (const Value *Op : GEP->operands()) { + const Instruction *Inst = dyn_cast<Instruction>(Op); + if (!Inst || L->isLoopInvariant(Op)) + continue; + + if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { + return SubLoop->contains(Inst); })) + continue; + HasLoopDef = true; + break; + } + if (!HasLoopDef) + continue; + + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // We also want to have more unrolling for local memory to let ds + // instructions with different offsets combine. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = Threshold; + DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n" + << *L << " due to " << *GEP << '\n'); + if (UP.Threshold >= MaxBoost) + return; } } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -76,35 +195,73 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { - return Vector ? 0 : 32; +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. 
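Aside, not part of the patch: concrete numbers for the deliberate under-reporting above. The SI+ hardware figure is 256 VGPRs (the return value itself sits in context this hunk does not show), so the shift hands the vectorizer 32:

// Standalone check of the arithmetic; not code from the patch.
constexpr unsigned HardwareVGPRs = 256;   // SI and newer, per lane
constexpr unsigned ReportedToVectorizer = HardwareVGPRs >> 3;
static_assert(ReportedToVectorizer == 32,
              "the loop vectorizer is shown 32 of the 256 hardware VGPRs");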
+ return getHardwareNumberOfRegisters(Vec) >> 3; +} + +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - switch (AddrSpace) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::FLAT_ADDRESS: + AMDGPUAS AS = ST->getAMDGPUAS(); + if (AddrSpace == AS.GLOBAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS || + AddrSpace == AS.FLAT_ADDRESS) return 128; - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: + if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.REGION_ADDRESS) return 64; - case AMDGPUAS::PRIVATE_ADDRESS: + if (AddrSpace == AS.PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - default: - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && - (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || - AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || - (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && - AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) - return 128; - llvm_unreachable("unhandled address space"); + + if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && + (AddrSpace == AS.PARAM_D_ADDRESS || + AddrSpace == AS.PARAM_I_ADDRESS || + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); +} + +bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + // We allow vectorization of flat stores, even though we may need to decompose + // them later if they may access private memory. We don't have enough context + // here, and legalization can handle it. + if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) { + return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } + return true; +} + +bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { - // Semi-arbitrary large amount. - return 64; + // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. + if (VF == 1) + return 1; + + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( @@ -216,28 +373,29 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: - case Instruction::InsertElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); + if (EltSize < 32) { + if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) + return 0; + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + // Extracts are just reads of a subregister, so are free. Inserts are // considered free because we don't want to have any cost for scalarizing // operations, and we don't have to copy into a different register class. // Dynamic indexing isn't free and is best avoided. return Index == ~0u ? 
2 : 0; + } default: return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } } -static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, - const IntrinsicInst *I) { +static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { switch (I->getIntrinsicID()) { - default: - return false; - case Intrinsic::not_intrinsic: - // This means we have an intrinsic that isn't defined in - // IntrinsicsAMDGPU.td - break; - case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::amdgcn_workitem_id_z: @@ -249,6 +407,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::r600_read_tidig_x: case Intrinsic::r600_read_tidig_y: case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_image_atomic_swap: case Intrinsic::amdgcn_image_atomic_add: case Intrinsic::amdgcn_image_atomic_sub: @@ -274,16 +434,10 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_cmpswap: case Intrinsic::amdgcn_ps_live: + case Intrinsic::amdgcn_ds_swizzle: return true; - } - - StringRef Name = I->getCalledFunction()->getName(); - switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { default: return false; - case AMDGPUIntrinsic::SI_fs_interp: - case AMDGPUIntrinsic::SI_fs_constant: - return true; } } @@ -291,16 +445,24 @@ static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); // Arguments to compute shaders are never a source of divergence. - if (!AMDGPU::isShader(F->getCallingConv())) - return true; - - // For non-compute shaders, SGPR inputs are marked with either inreg or byval. - if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || - F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + CallingConv::ID CC = F->getCallingConv(); + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: return true; - - // Everything else is in VGPRs. - return false; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + // Everything else is in VGPRs. + return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || + F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal); + default: + // TODO: Should calls support inreg for SGPR inputs? + return false; + } } /// @@ -318,7 +480,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { // All other loads are not divergent, because if threads issue loads with the // same arguments, they will always get the same result. 
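Aside, not part of the patch: the calling-convention rule that the rewritten isArgPassedInSGPR() above implements, restated without LLVM types. The enum and helper names are invented for illustration:

enum class ArgCC { Kernel, GraphicsShader, Other };

static bool argIsUniform(ArgCC Conv, bool MarkedInRegOrByVal) {
  switch (Conv) {
  case ArgCC::Kernel:         return true;               // kernel args always land in SGPRs
  case ArgCC::GraphicsShader: return MarkedInRegOrByVal; // only inreg/byval inputs do
  case ArgCC::Other:          return false;              // everything else is in VGPRs
  }
  return false;
}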
if (const LoadInst *Load = dyn_cast<LoadInst>(V)) - return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS; // Atomics are divergent because they are executed sequentially: when an // atomic operation refers to the same address in each thread, then each @@ -327,10 +489,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V)) return true; - if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - const TargetMachine &TM = getTLI()->getTargetMachine(); - return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); - } + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) + return isIntrinsicSourceOfDivergence(Intrinsic); // Assume all function calls are a source of divergence. if (isa<CallInst>(V) || isa<InvokeInst>(V)) @@ -338,3 +498,39 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } + +bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + return true; + } + } + return false; +} + +unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + if (ST->hasVOP3PInsts()) { + VectorType *VT = cast<VectorType>(Tp); + if (VT->getNumElements() == 2 && + DL.getTypeSizeInBits(VT->getElementType()) == 16) { + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle is free. + + switch (Kind) { + case TTI::SK_Broadcast: + case TTI::SK_Reverse: + case TTI::SK_PermuteSingleSrc: + return 0; + default: + break; + } + } + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 0d83b2a..9a320bd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -32,6 +32,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { const AMDGPUSubtarget *ST; const AMDGPUTargetLowering *TLI; + bool IsGraphicsShader; const AMDGPUSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } @@ -62,20 +63,35 @@ public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), - TLI(ST->getTargetLowering()) {} + TLI(ST->getTargetLowering()), + IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} bool hasBranchDivergence() { return true; } - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; + + 
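Aside, not part of the patch: the declarations that follow mirror the isLegalToVectorizeMemChain() implementation added earlier in this file's .cpp change. Its private-address-space rule reduces to two numeric checks; a standalone restatement with invented parameter names:

#include <cstdio>

static bool legalPrivateChain(unsigned ChainSizeInBytes, unsigned Alignment,
                              bool HasUnalignedScratchAccess,
                              unsigned MaxPrivateElementSize) {
  return (Alignment >= 4 || HasUnalignedScratchAccess) &&
         ChainSizeInBytes <= MaxPrivateElementSize;
}

int main() {
  // A 16-byte chain (for example <4 x i32>) at align 4 is only legal when the
  // subtarget allows 16-byte private elements.
  std::printf("%d\n", legalPrivateChain(16, 4, false, 16)); // 1
  std::printf("%d\n", legalPrivateChain(16, 4, false, 4));  // 0
  return 0;
}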
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( @@ -90,8 +106,21 @@ public: int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; + bool isAlwaysUniform(const Value *V) const; + + unsigned getFlatAddressSpace() const { + // Don't bother running InferAddressSpaces pass on graphics shaders which + // don't use flat addressing. + if (IsGraphicsShader) + return -1; + return ST->hasFlatAddressSpace() ? + ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE; + } unsigned getVectorSplitCost() { return 0; } + + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp); }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp new file mode 100644 index 0000000..309913f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -0,0 +1,225 @@ +//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring +// there is at most one ret and one unreachable instruction, it ensures there is +// at most one divergent exiting block. +// +// StructurizeCFG can't deal with multi-exit regions formed by branches to +// multiple return nodes. It is not desirable to structurize regions with +// uniform branches, so unifying those to the same return block as divergent +// branches inhibits use of scalar branching. It still can't deal with the case +// where one branch goes to return, and one unreachable. Replace unreachable in +// this case with a return. 
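Aside, not part of the patch: a minimal, invented source-level shape of the problem the new file's comment above describes. Both returns sit behind a per-lane condition, so before StructurizeCFG runs they get funneled into a single return block:

int divergentExits(int LaneId, int X) {
  if (LaneId & 1)   // divergent: differs between lanes of a wave
    return X + 1;   // exit #1
  return X - 1;     // exit #2, merged with #1 into one UnifiedReturnBlock
}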
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes" + +namespace { + +class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { + initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry()); + } + + // We can preserve non-critical-edgeness when we unify function exit nodes + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} + +char AMDGPUUnifyDivergentExitNodes::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) + +char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; + +void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ + // TODO: Preserve dominator tree. + AU.addRequired<PostDominatorTreeWrapperPass>(); + + AU.addRequired<DivergenceAnalysis>(); + + // No divergent values are changed, only blocks and branch edges. + AU.addPreserved<DivergenceAnalysis>(); + + // We preserve the non-critical-edgeness property + AU.addPreservedID(BreakCriticalEdgesID); + + // This is a cluster of orthogonal Transforms + AU.addPreservedID(LowerSwitchID); + FunctionPass::getAnalysisUsage(AU); + + AU.addRequired<TargetTransformInfoWrapperPass>(); +} + +/// \returns true if \p BB is reachable through only uniform branches. +/// XXX - Is there a more efficient way to find this? +static bool isUniformlyReached(const DivergenceAnalysis &DA, + BasicBlock &BB) { + SmallVector<BasicBlock *, 8> Stack; + SmallPtrSet<BasicBlock *, 8> Visited; + + for (BasicBlock *Pred : predecessors(&BB)) + Stack.push_back(Pred); + + while (!Stack.empty()) { + BasicBlock *Top = Stack.pop_back_val(); + if (!DA.isUniform(Top->getTerminator())) + return false; + + for (BasicBlock *Pred : predecessors(Top)) { + if (Visited.insert(Pred).second) + Stack.push_back(Pred); + } + } + + return true; +} + +static BasicBlock *unifyReturnBlockSet(Function &F, + ArrayRef<BasicBlock *> ReturningBlocks, + const TargetTransformInfo &TTI, + StringRef Name) { + // Otherwise, we need to insert a new basic block into the function, add a PHI + // nodes (if the function returns values), and convert all of the return + // instructions into unconditional branches. + // + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); + + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... 
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + + for (BasicBlock *BB : ReturningBlocks) { + // Cleanup possible branch to unconditional branch to the return. + SimplifyCFG(BB, TTI, 2); + } + + return NewRetBlock; +} + +bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + if (PDT.getRoots().size() <= 1) + return false; + + DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>(); + + // Loop over all of the blocks in a function, tracking all of the blocks that + // return. + // + SmallVector<BasicBlock *, 4> ReturningBlocks; + SmallVector<BasicBlock *, 4> UnreachableBlocks; + + for (BasicBlock *BB : PDT.getRoots()) { + if (isa<ReturnInst>(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + ReturningBlocks.push_back(BB); + } else if (isa<UnreachableInst>(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + UnreachableBlocks.push_back(BB); + } + } + + if (!UnreachableBlocks.empty()) { + BasicBlock *UnreachableBlock = nullptr; + + if (UnreachableBlocks.size() == 1) { + UnreachableBlock = UnreachableBlocks.front(); + } else { + UnreachableBlock = BasicBlock::Create(F.getContext(), + "UnifiedUnreachableBlock", &F); + new UnreachableInst(F.getContext(), UnreachableBlock); + + for (BasicBlock *BB : UnreachableBlocks) { + BB->getInstList().pop_back(); // Remove the unreachable inst. + BranchInst::Create(UnreachableBlock, BB); + } + } + + if (!ReturningBlocks.empty()) { + // Don't create a new unreachable inst if we have a return. The + // structurizer/annotator can't handle the multiple exits + + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst. + + Function *UnreachableIntrin = + Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + + // Insert a call to an intrinsic tracking that this is an unreachable + // point, in case we want to kill the active lanes or something later. + CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock); + + // Don't create a scalar trap. We would only want to trap if this code was + // really reached, but a scalar trap would happen even if no lanes + // actually reached here. + ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock); + ReturningBlocks.push_back(UnreachableBlock); + } + } + + // Now handle return blocks. 
+ if (ReturningBlocks.empty()) + return false; // No blocks return + + if (ReturningBlocks.size() == 1) + return false; // Already has a single return block + + const TargetTransformInfo &TTI + = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock"); + return true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index bf501a1..3a0c3ed 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -13,38 +13,39 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include <algorithm> +#include <cassert> using namespace llvm; namespace { + namespace kOCLMD { + const char SpirVer[] = "opencl.spir.version"; const char OCLVer[] = "opencl.ocl.version"; const char UsedExt[] = "opencl.used.extensions"; const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; const char CompilerOptions[] = "opencl.compiler.options"; const char LLVMIdent[] = "llvm.ident"; - } + + } // end namespace kOCLMD /// \brief Unify multiple OpenCL metadata due to linking. - class AMDGPUUnifyMetadata : public FunctionPass { + class AMDGPUUnifyMetadata : public ModulePass { public: static char ID; - explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {}; + explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}; private: - // This should really be a module pass but we have to run it as early - // as possible, so given function passes are executed first and - // TargetMachine::addEarlyAsPossiblePasses() expects only function passes - // it has to be a function pass. virtual bool runOnModule(Module &M); - // \todo: Convert to a module pass. - virtual bool runOnFunction(Function &F); - /// \brief Unify version metadata. /// \return true if changes are made. 
/// Assume the named metadata has operands each of which is a pair of @@ -117,7 +118,7 @@ INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata", "Unify multiple OpenCL metadata due to linking", false, false) -FunctionPass* llvm::createAMDGPUUnifyMetadataPass() { +ModulePass* llvm::createAMDGPUUnifyMetadataPass() { return new AMDGPUUnifyMetadata(); } @@ -143,7 +144,3 @@ bool AMDGPUUnifyMetadata::runOnModule(Module &M) { return Changed; } - -bool AMDGPUUnifyMetadata::runOnFunction(Function &F) { - return runOnModule(*F.getParent()); -} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 7faeccd..1a39384 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -9,27 +9,40 @@ //==-----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "R600RegisterInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Dominators.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstddef> #include <deque> +#include <iterator> +#include <map> +#include <utility> +#include <vector> using namespace llvm; @@ -53,15 +66,19 @@ STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); -} + +} // end namespace llvm + +namespace { //===----------------------------------------------------------------------===// // // Miscellaneous utility for CFGStructurizer. 
// //===----------------------------------------------------------------------===// -namespace { + #define SHOWNEWINSTR(i) \ DEBUG(dbgs() << "New instr: " << *i << "\n"); @@ -82,35 +99,19 @@ DEBUG( \ #define INVALIDSCCNUM -1 -template<class NodeT> -void ReverseVector(SmallVectorImpl<NodeT *> &Src) { - size_t sz = Src.size(); - for (size_t i = 0; i < sz/2; ++i) { - NodeT *t = Src[i]; - Src[i] = Src[sz - i - 1]; - Src[sz - i - 1] = t; - } -} - -} // end anonymous namespace - //===----------------------------------------------------------------------===// // // supporting data structure for CFGStructurizer // //===----------------------------------------------------------------------===// - -namespace { - class BlockInformation { public: - bool IsRetired; - int SccNum; - BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} -}; + bool IsRetired = false; + int SccNum = INVALIDSCCNUM; -} // end anonymous namespace + BlockInformation() = default; +}; //===----------------------------------------------------------------------===// // @@ -118,7 +119,6 @@ public: // //===----------------------------------------------------------------------===// -namespace { class AMDGPUCFGStructurizer : public MachineFunctionPass { public: typedef SmallVector<MachineBasicBlock *, 32> MBBVector; @@ -133,8 +133,7 @@ public: static char ID; - AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + AMDGPUCFGStructurizer() : MachineFunctionPass(ID) { initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } @@ -167,7 +166,7 @@ public: MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + DEBUG(MDT->print(dbgs(), (const Module*)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); DEBUG(PDT->print(dbgs());); prepare(); @@ -180,8 +179,8 @@ protected: MachineDominatorTree *MDT; MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; + const R600InstrInfo *TII = nullptr; + const R600RegisterInfo *TRI = nullptr; // PRINT FUNCTIONS /// Print the ordered Blocks. 
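The BlockInformation and AMDGPUCFGStructurizer changes above (and the KernelScopeInfo and AMDGPUAsmParser changes later in this patch) all migrate to C++11 in-class default member initializers with defaulted or trimmed constructors. A minimal standalone sketch of that idiom, using placeholder names rather than the real structurizer types:

#include <cassert>

static const int kInvalidSccNum = -1;   // stands in for INVALIDSCCNUM

class BlockInfoSketch {
public:
  bool IsRetired = false;               // initialized at the point of declaration
  int SccNum = kInvalidSccNum;          // no hand-written init list required

  BlockInfoSketch() = default;          // the defaulted ctor picks these values up
};

int main() {
  BlockInfoSketch B;
  assert(!B.IsRetired && B.SccNum == kInvalidSccNum);
  return 0;
}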
@@ -198,6 +197,7 @@ protected: } } } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { for (MachineLoop::iterator iter = LoopInfo.begin(), iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { @@ -263,7 +263,6 @@ protected: MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); static void wrapup(MachineBasicBlock *MBB); - int patternMatch(MachineBasicBlock *MBB); int patternMatchGroup(MachineBasicBlock *MBB); int serialPatternMatch(MachineBasicBlock *MBB); @@ -328,7 +327,6 @@ protected: void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - private: MBBInfoMap BlockInfoMap; LoopLandInfoMap LLInfoMap; @@ -337,6 +335,10 @@ private: SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks; }; +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) @@ -379,6 +381,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { } return false; } + AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, bool AllowSideEntry) const { @@ -697,10 +700,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but // there isn't such an interface yet. alternatively, replace all the other // blocks in the jump table with the entryBlk //} - } - bool AMDGPUCFGStructurizer::prepare() { bool Changed = false; @@ -748,7 +749,6 @@ bool AMDGPUCFGStructurizer::prepare() { } bool AMDGPUCFGStructurizer::run() { - //Assume reducible CFG... DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); @@ -886,8 +886,6 @@ bool AMDGPUCFGStructurizer::run() { return true; } - - void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; MachineBasicBlock *MBB; @@ -903,11 +901,8 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { } } - //walk through all the block in func to check for unreachable - typedef GraphTraits<MachineFunction *> GTM; - auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; + // walk through all the block in func to check for unreachable + for (auto *MBB : nodes(MF)) { SccNum = getSCCNum(MBB); if (SccNum == INVALIDSCCNUM) dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; @@ -941,7 +936,6 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { return NumMatch; } - int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { if (MBB->succ_size() != 1) return 0; @@ -1039,7 +1033,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { for (MachineLoop *ML : depth_first(It)) NestedLoops.push_front(ML); - if (NestedLoops.size() == 0) + if (NestedLoops.empty()) return 0; // Process nested loop outside->inside (we did push_front), @@ -1074,13 +1068,9 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MachineBasicBlock *ExitBlk = *ExitBlks.begin(); assert(ExitBlk && "Loop has several exit block"); MBBVector LatchBlks; - typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits; - InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), - PE = InvMBBTraits::child_end(LoopHeader); - for (; PI != PE; PI++) { - if (LoopRep->contains(*PI)) - LatchBlks.push_back(*PI); - } + for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader)) + if 
(LoopRep->contains(LB)) + LatchBlks.push_back(LB); for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); @@ -1217,7 +1207,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( } } - dbgs() << "\n"; + dbgs() << "\n"; } int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, @@ -1478,7 +1468,6 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (LandMBB && TrueMBB && FalseMBB) MBB->addSuccessor(LandMBB); - } void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, @@ -1491,7 +1480,6 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, DstBlk->replaceSuccessor(DstBlk, LandMBB); } - void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() @@ -1727,11 +1715,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { && "can't retire block yet"); } -char AMDGPUCFGStructurizer::ID = 0; - -} // end anonymous namespace - - INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3cf9a1d..b37c2741 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -11,17 +11,19 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" -#include "Utils/AMDGPUAsmUtils.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -39,15 +41,11 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/MathExtras.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -56,7 +54,6 @@ #include <map> #include <memory> #include <string> -#include <vector> using namespace llvm; using namespace llvm::AMDGPU; @@ -83,7 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { const AMDGPUAsmParser *AsmParser; public: - AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_) + AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_) : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} typedef std::unique_ptr<AMDGPUOperand> Ptr; @@ -155,12 +152,19 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, ImmTyInterpSlot, ImmTyInterpAttr, - ImmTyAttrChan + ImmTyAttrChan, + ImmTyOpSel, + ImmTyOpSelHi, + ImmTyNegLo, 
+ ImmTyNegHi, + ImmTySwizzle }; struct TokOp { @@ -258,6 +262,8 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } + bool isSDWARegKind() const; + bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; } @@ -283,10 +289,15 @@ public: bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } + + bool isOffsetU12() const { return isImmTy(ImmTyOffset) && isUInt<12>(getImm()); } + bool isOffsetS13() const { return isImmTy(ImmTyOffset) && isInt<13>(getImm()); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -297,6 +308,10 @@ public: bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); } bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); } bool isAttrChan() const { return isImmTy(ImmTyAttrChan); } + bool isOpSel() const { return isImmTy(ImmTyOpSel); } + bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } + bool isNegLo() const { return isImmTy(ImmTyNegLo); } + bool isNegHi() const { return isImmTy(ImmTyNegHi); } bool isMod() const { return isClampSI() || isOModSI(); @@ -316,6 +331,10 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } + bool isSCSrcV2B16() const { + return isSCSrcB16(); + } + bool isSCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } @@ -328,6 +347,10 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } + bool isSCSrcV2F16() const { + return isSCSrcF16(); + } + bool isSCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } @@ -344,6 +367,11 @@ public: return isSCSrcB16() || isLiteralImm(MVT::i16); } + bool isSSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isSSrcB16(); + } + bool isSSrcB64() const { // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. // See isVSrc64(). 
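The new isSCSrcV2B16/isSCSrcV2F16-style predicates above describe packed operands that hold two 16-bit lanes in one 32-bit slot; when an inline 16-bit literal is supplied for such an operand, the value is replicated into both halves, which is what the addLiteralImmOperand changes further down do via ImmVal |= (ImmVal << 16). A standalone sketch of that packing, with an illustrative function name:

#include <cassert>
#include <cstdint>

// Replicate one 16-bit lane into both halves of a 32-bit packed immediate.
static uint32_t packV2From16(uint16_t Half) {
  return (static_cast<uint32_t>(Half) << 16) | Half;
}

int main() {
  // 0x3C00 is the half-precision (f16) encoding of 1.0, so the packed
  // v2f16 immediate <1.0, 1.0> becomes 0x3C003C00.
  assert(packV2From16(0x3C00) == 0x3C003C00u);
  return 0;
}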
@@ -362,6 +390,11 @@ public: return isSCSrcB16() || isLiteralImm(MVT::f16); } + bool isSSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isSSrcF16(); + } + bool isVCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -374,6 +407,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } + bool isVCSrcV2B16() const { + return isVCSrcB16(); + } + bool isVCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } @@ -386,6 +423,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } + bool isVCSrcV2F16() const { + return isVCSrcF16(); + } + bool isVSrcB32() const { return isVCSrcF32() || isLiteralImm(MVT::i32); } @@ -398,6 +439,11 @@ public: return isVCSrcF16() || isLiteralImm(MVT::i16); } + bool isVSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isVSrcB16(); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32); } @@ -410,6 +456,11 @@ public: return isVCSrcF16() || isLiteralImm(MVT::f16); } + bool isVSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isVSrcF16(); + } + bool isKImmFP32() const { return isLiteralImm(MVT::f32); } @@ -433,11 +484,14 @@ public: bool isSWaitCnt() const; bool isHwreg() const; bool isSendMsg() const; + bool isSwizzle() const; bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; bool isDPPCtrl() const; bool isGPRIdxMode() const; + bool isS16Imm() const; + bool isU16Imm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -459,7 +513,7 @@ public: return Imm.Val; } - enum ImmTy getImmTy() const { + ImmTy getImmTy() const { assert(isImm()); return Imm.Type; } @@ -501,9 +555,11 @@ public: return getModifiers().hasIntModifiers(); } + uint64_t applyInputFPModifiers(uint64_t Val, unsigned Size) const; + void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const; - void addLiteralImmOperand(MCInst &Inst, int64_t Val) const; + void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const; template <unsigned Bitwidth> void addKImmFPOperands(MCInst &Inst, unsigned N) const; @@ -586,6 +642,8 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -610,6 +668,11 @@ public: case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; case ImmTyAttrChan: OS << "AttrChan"; break; + case ImmTyOpSel: OS << "OpSel"; break; + case ImmTyOpSelHi: OS << "OpSelHi"; break; + case ImmTyNegLo: OS << "NegLo"; break; + case ImmTyNegHi: OS << "NegHi"; break; + case ImmTySwizzle: OS << "Swizzle"; break; } } @@ -636,7 +699,7 @@ public: static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser, int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, + ImmTy Type = ImmTyNone, bool IsFPImm = false) { auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser); Op->Imm.Val = Val; @@ -695,9 +758,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { // Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next // .amdgpu_hsa_kernel or at EOF. 
class KernelScopeInfo { - int SgprIndexUnusedMin; - int VgprIndexUnusedMin; - MCContext *Ctx; + int SgprIndexUnusedMin = -1; + int VgprIndexUnusedMin = -1; + MCContext *Ctx = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -708,6 +771,7 @@ class KernelScopeInfo { } } } + void usesVgprAt(int i) { if (i >= VgprIndexUnusedMin) { VgprIndexUnusedMin = ++i; @@ -717,14 +781,16 @@ class KernelScopeInfo { } } } + public: - KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr) - {} + KernelScopeInfo() = default; + void initialize(MCContext &Context) { Ctx = &Context; usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); } + void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; @@ -738,9 +804,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; MCAsmParser &Parser; - unsigned ForcedEncodingSize; - bool ForcedDPP; - bool ForcedSDWA; + unsigned ForcedEncodingSize = 0; + bool ForcedDPP = false; + bool ForcedSDWA = false; KernelScopeInfo KernelScope; /// @name Auto-generated Match Functions @@ -756,55 +822,57 @@ private: bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); - bool ParseDirectiveRuntimeMetadata(); + bool ParseDirectiveCodeObjectMetadata(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); - bool ParseSectionDirectiveHSAText(); bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; bool ParseDirectiveAMDGPUHsaKernel(); - bool ParseDirectiveAMDGPUHsaModuleGlobal(); - bool ParseDirectiveAMDGPUHsaProgramGlobal(); - bool ParseSectionDirectiveHSADataGlobalAgent(); - bool ParseSectionDirectiveHSADataGlobalProgram(); - bool ParseSectionDirectiveHSARodataReadonlyAgent(); - bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); - bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); - void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn); + bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, + RegisterKind RegKind, unsigned Reg1, + unsigned RegNum); + bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, + unsigned& RegNum, unsigned& RegWidth, + unsigned *DwordRegIndex); + void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic, bool IsAtomicReturn); + void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, + bool IsGdsHardcoded); public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; + typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0), - ForcedDPP(false), - ForcedSDWA(false) { + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) { MCAsmParserExtension::Initialize(Parser); - if (getSTI().getFeatureBits().none()) { + if (getFeatureBits().none()) { // Set default features. 
copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); { // TODO: make those pre-defined variables read-only. // Currently there is none suitable machinery in the core llvm-mc for this. // MCSymbol::isRedefinable is intended for another purpose, and // AsmParser::parseDirectiveSet() cannot be specialized for specific target. - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); MCContext &Ctx = getContext(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx)); + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } KernelScope.initialize(getContext()); } @@ -821,8 +889,16 @@ public: return AMDGPU::isVI(getSTI()); } + bool isGFX9() const { + return AMDGPU::isGFX9(getSTI()); + } + bool hasInv2PiInlineImm() const { - return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; + return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; + } + + bool hasFlatOffsets() const { + return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets]; } bool hasSGPR102_SGPR103() const { @@ -844,6 +920,10 @@ public: return &MII; } + const FeatureBitset &getFeatureBits() const { + return getSTI().getFeatureBits(); + } + void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; } void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; } void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; } @@ -871,19 +951,28 @@ public: //bool ProcessInstruction(MCInst &Inst); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, bool (*ConvertResult)(int64_t &) = nullptr); + + OperandMatchResultTy parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = nullptr); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); - OperandMatchResultTy parseImm(OperandVector &Operands); + bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false); + OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false); OperandMatchResultTy parseReg(OperandVector &Operands); - OperandMatchResultTy parseRegOrImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false); OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); 
OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); @@ -891,7 +980,8 @@ public: OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } + void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); } void cvtExp(MCInst &Inst, const OperandVector &Operands); bool parseCnt(int64_t &IntVal); @@ -911,6 +1001,19 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateConstantBusLimitations(const MCInst &Inst); + bool validateEarlyClobberLimitations(const MCInst &Inst); + bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); + bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; + unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + + bool trySkipId(const StringRef Id); + bool trySkipToken(const AsmToken::TokenKind Kind); + bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); + bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseExpr(int64_t &Imm); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -920,9 +1023,24 @@ public: OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg); + OperandMatchResultTy parseSwizzleOp(OperandVector &Operands); + bool parseSwizzleOffset(int64_t &Imm); + bool parseSwizzleMacro(int64_t &Imm); + bool parseSwizzleQuadPerm(int64_t &Imm); + bool parseSwizzleBitmaskPerm(int64_t &Imm); + bool parseSwizzleBroadcast(int64_t &Imm); + bool parseSwizzleSwap(int64_t &Imm); + bool parseSwizzleReverse(int64_t &Imm); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -935,14 +1053,18 @@ public: AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; + AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); - void cvtId(MCInst &Inst, const OperandVector &Operands); - void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); - void cvtMIMG(MCInst &Inst, const OperandVector &Operands); + void cvtMIMG(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic = false); void cvtMIMGAtomic(MCInst 
&Inst, const OperandVector &Operands); OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); @@ -956,9 +1078,10 @@ public: OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType); + uint64_t BasicInstType, bool skipVcc = false); }; struct OptionalOperand { @@ -988,6 +1111,30 @@ static const fltSemantics *getFltSemantics(MVT VT) { return getFltSemantics(VT.getSizeInBits() / 8); } +static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return &APFloat::IEEEsingle(); + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return &APFloat::IEEEdouble(); + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + return &APFloat::IEEEhalf(); + default: + llvm_unreachable("unsupported fp type"); + } +} + //===----------------------------------------------------------------------===// // Operand //===----------------------------------------------------------------------===// @@ -1031,13 +1178,18 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { if (!canLosslesslyConvertToFPType(FPLiteral, type)) return false; + if (type.getScalarSizeInBits() == 16) { + return AMDGPU::isInlinableLiteral16( + static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()), + AsmParser->hasInv2PiInlineImm()); + } + // Check if single precision literal is inlinable return AMDGPU::isInlinableLiteral32( static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()), AsmParser->hasInv2PiInlineImm()); } - // We got int literal token. if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand return AMDGPU::isInlinableLiteral64(Imm.Val, @@ -1056,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } @@ -1064,6 +1216,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { if (!Imm.IsFPImm) { // We got int literal token. + if (type == MVT::f64 && hasFPModifiers()) { + // Cannot apply fp modifiers to int literals preserving the same semantics + // for VOP1/2/C and VOP3 because of integer truncation. To avoid ambiguity, + // disable these cases. + return false; + } + unsigned Size = type.getSizeInBits(); if (Size == 64) Size = 32; @@ -1093,40 +1252,66 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } -void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { - int64_t Val = Imm.Val; - if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) { - // Apply modifiers to immediate value. 
Only negate can get here - if (Imm.IsFPImm) { - APFloat F(BitsToDouble(Val)); - F.changeSign(); - Val = F.bitcastToAPInt().getZExtValue(); - } else { - Val = -Val; - } +bool AMDGPUOperand::isSDWARegKind() const { + if (AsmParser->isVI()) + return isVReg(); + else if (AsmParser->isGFX9()) + return isRegKind(); + else + return false; +} + +uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const +{ + assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); + assert(Size == 2 || Size == 4 || Size == 8); + + const uint64_t FpSignMask = (1ULL << (Size * 8 - 1)); + + if (Imm.Mods.Abs) { + Val &= ~FpSignMask; + } + if (Imm.Mods.Neg) { + Val ^= FpSignMask; } + return Val; +} + +void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { + if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) { - addLiteralImmOperand(Inst, Val); + addLiteralImmOperand(Inst, Imm.Val, + ApplyModifiers & + isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); } else { - Inst.addOperand(MCOperand::createImm(Val)); + assert(!isImmTy(ImmTyNone) || !hasModifiers()); + Inst.addOperand(MCOperand::createImm(Imm.Val)); } } -void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { +void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const { const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode()); auto OpNum = Inst.getNumOperands(); // Check that this operand accepts literals assert(AMDGPU::isSISrcOperand(InstDesc, OpNum)); - auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size + if (ApplyModifiers) { + assert(AMDGPU::isSISrcFPOperand(InstDesc, OpNum)); + const unsigned Size = Imm.IsFPImm ? sizeof(double) : getOperandSize(InstDesc, OpNum); + Val = applyInputFPModifiers(Val, Size); + } + + APInt Literal(64, Val); + uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType; if (Imm.IsFPImm) { // We got fp literal token - APInt Literal(64, Val); - - switch (OpSize) { - case 8: { + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1151,16 +1336,31 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { // in predicate methods (isLiteralImm()) llvm_unreachable("fp literal in 64-bit integer instruction."); } - case 4: - case 2: { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision - FPLiteral.convert(*getFltSemantics(OpSize), + FPLiteral.convert(*getOpFltSemantics(OpTy), APFloat::rmNearestTiesToEven, &lost); // We allow precision lost but not overflow or underflow. 
This should be // checked earlier in isLiteralImm() - Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); + + uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); + if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || + OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + ImmVal |= (ImmVal << 16); + } + + Inst.addOperand(MCOperand::createImm(ImmVal)); return; } default: @@ -1173,8 +1373,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { // We got int literal token. // Only sign extend inline immediates. // FIXME: No errors on truncation - switch (OpSize) { - case 4: { + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: { if (isInt<32>(Val) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1185,9 +1388,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); return; } - case 8: { - if (AMDGPU::isInlinableLiteral64(Val, - AsmParser->hasInv2PiInlineImm())) { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -1195,7 +1400,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Lo_32(Val))); return; } - case 2: { + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Val) && AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1206,6 +1414,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Val & 0xffff)); return; } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue()); + assert(AMDGPU::isInlinableLiteral16(LiteralVal, + AsmParser->hasInv2PiInlineImm())); + + uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 | + static_cast<uint32_t>(LiteralVal); + Inst.addOperand(MCOperand::createImm(ImmVal)); + return; + } default: llvm_unreachable("invalid operand size"); } @@ -1289,7 +1508,8 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Default(0); } -bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { auto R = parseRegister(); if (!R) return true; assert(R->isReg()); @@ -1299,20 +1519,43 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End return false; } -bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum) -{ +bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, + RegisterKind RegKind, unsigned Reg1, + unsigned RegNum) { switch (RegKind) { case IS_SPECIAL: - if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; } - if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; 
return true; } - if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; } - if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; } - if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; } + if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { + Reg = AMDGPU::EXEC; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { + Reg = AMDGPU::FLAT_SCR; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { + Reg = AMDGPU::VCC; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { + Reg = AMDGPU::TBA; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { + Reg = AMDGPU::TMA; + RegWidth = 2; + return true; + } return false; case IS_VGPR: case IS_SGPR: case IS_TTMP: - if (Reg1 != Reg + RegWidth) { return false; } + if (Reg1 != Reg + RegWidth) { + return false; + } RegWidth++; return true; default: @@ -1320,8 +1563,9 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R } } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex) -{ +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + unsigned *DwordRegIndex) { if (DwordRegIndex) { *DwordRegIndex = 0; } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (getLexer().is(AsmToken::Identifier)) { @@ -1462,8 +1706,33 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); } +bool +AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) { + if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) && + (getLexer().getKind() == AsmToken::Integer || + getLexer().getKind() == AsmToken::Real)) { + + // This is a workaround for handling operands like these: + // |1.0| + // |-1| + // This syntax is not compatible with syntax of standard + // MC expressions (due to the trailing '|'). 
+ + SMLoc EndLoc; + const MCExpr *Expr; + + if (getParser().parsePrimaryExpr(Expr, EndLoc)) { + return true; + } + + return !Expr->evaluateAsAbsolute(Val); + } + + return getParser().parseAbsoluteExpression(Val); +} + OperandMatchResultTy -AMDGPUAsmParser::parseImm(OperandVector &Operands) { +AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { // TODO: add syntactic sugar for 1/(2*PI) bool Minus = false; if (getLexer().getKind() == AsmToken::Minus) { @@ -1475,7 +1744,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { switch(getLexer().getKind()) { case AsmToken::Integer: { int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) + if (parseAbsoluteExpr(IntVal, AbsMod)) return MatchOperand_ParseFail; if (Minus) IntVal *= -1; @@ -1484,7 +1753,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } case AsmToken::Real: { int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) + if (parseAbsoluteExpr(IntVal, AbsMod)) return MatchOperand_ParseFail; APFloat F(BitsToDouble(IntVal)); @@ -1512,8 +1781,8 @@ AMDGPUAsmParser::parseReg(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) { + auto res = parseImm(Operands, AbsMod); if (res != MatchOperand_NoMatch) { return res; } @@ -1522,18 +1791,50 @@ AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { - // XXX: During parsing we can't determine if minus sign means - // negate-modifier or negative immediate value. - // By default we suppose it is modifier. - bool Negate = false, Abs = false, Abs2 = false; +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, + bool AllowImm) { + bool Negate = false, Negate2 = false, Abs = false, Abs2 = false; if (getLexer().getKind()== AsmToken::Minus) { + const AsmToken NextToken = getLexer().peekTok(); + + // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. + if (NextToken.is(AsmToken::Minus)) { + Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier"); + return MatchOperand_ParseFail; + } + + // '-' followed by an integer literal N should be interpreted as integer + // negation rather than a floating-point NEG modifier applied to N. 
+ // Beside being contr-intuitive, such use of floating-point NEG modifier + // results in different meaning of integer literals used with VOP1/2/C + // and VOP3, for example: + // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF + // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 + // Negative fp literals should be handled likewise for unifomtity + if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) { + Parser.Lex(); + Negate = true; + } + } + + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "neg") { + if (Negate) { + Error(Parser.getTok().getLoc(), "expected register or immediate"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Negate2 = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after neg"); + return MatchOperand_ParseFail; + } Parser.Lex(); - Negate = true; } - if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "abs") { Parser.Lex(); Abs2 = true; if (getLexer().isNot(AsmToken::LParen)) { @@ -1554,7 +1855,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo OperandMatchResultTy Res; if (AllowImm) { - Res = parseRegOrImm(Operands); + Res = parseRegOrImm(Operands, Abs); } else { Res = parseReg(Operands); } @@ -1563,9 +1864,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo } AMDGPUOperand::Modifiers Mods; - if (Negate) { - Mods.Neg = true; - } if (Abs) { if (getLexer().getKind() != AsmToken::Pipe) { Error(Parser.getTok().getLoc(), "expected vertical bar"); @@ -1583,6 +1881,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo Mods.Abs = true; } + if (Negate) { + Mods.Neg = true; + } else if (Negate2) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Neg = true; + } + if (Mods.hasFPModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); Op.setModifiers(Mods); @@ -1591,10 +1900,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, + bool AllowImm) { bool Sext = false; - if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "sext") { Parser.Lex(); Sext = true; if (getLexer().isNot(AsmToken::LParen)) { @@ -1661,7 +1972,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || @@ -1686,6 +1996,15 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { } } + if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) { + // FIXME: Produces error without correct column reported. 
+ auto OpNum = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset); + const auto &Op = Inst.getOperand(OpNum); + if (Op.getImm() != 0) + return Match_InvalidOperand; + } + return Match_Success; } @@ -1702,7 +2021,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { } if (isForcedSDWA()) { - static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA, + AMDGPUAsmVariants::SDWA9}; return makeArrayRef(Variants); } @@ -1713,12 +2033,182 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP }; return makeArrayRef(Variants); } +unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + const unsigned Num = Desc.getNumImplicitUses(); + for (unsigned i = 0; i < Num; ++i) { + unsigned Reg = Desc.ImplicitUses[i]; + switch (Reg) { + case AMDGPU::FLAT_SCR: + case AMDGPU::VCC: + case AMDGPU::M0: + return Reg; + default: + break; + } + } + return AMDGPU::NoRegister; +} + +// NB: This code is correct only when used to check constant +// bus limitations because GFX7 support no f16 inline constants. +// Note that there are no cases when a GFX7 opcode violates +// constant bus limitations due to the use of an f16 constant. +bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, + unsigned OpIdx) const { + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + + if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) { + return false; + } + + const MCOperand &MO = Inst.getOperand(OpIdx); + + int64_t Val = MO.getImm(); + auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx); + + switch (OpSize) { // expected operand size + case 8: + return AMDGPU::isInlinableLiteral64(Val, hasInv2PiInlineImm()); + case 4: + return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm()); + case 2: { + const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType; + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + } else { + return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); + } + } + default: + llvm_unreachable("invalid operand size"); + } +} + +bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { + const MCOperand &MO = Inst.getOperand(OpIdx); + if (MO.isImm()) { + return !isInlineConstant(Inst, OpIdx); + } + return !MO.isReg() || + isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); +} + +bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + unsigned ConstantBusUseCount = 0; + + if (Desc.TSFlags & + (SIInstrFlags::VOPC | + SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | + SIInstrFlags::SDWA)) { + + // Check special imm operands (used by madmk, etc) + if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { + ++ConstantBusUseCount; + } + + unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed != AMDGPU::NoRegister) { + ++ConstantBusUseCount; + } + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, 
AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (usesConstantBus(Inst, OpIdx)) { + if (MO.isReg()) { + const unsigned Reg = mc2PseudoReg(MO.getReg()); + // Pairs of registers with a partial intersections like these + // s0, s[0:1] + // flat_scratch_lo, flat_scratch + // flat_scratch_lo, flat_scratch_hi + // are theoretically valid but they are disabled anyway. + // Note that this code mimics SIInstrInfo::verifyInstruction + if (Reg != SGPRUsed) { + ++ConstantBusUseCount; + } + SGPRUsed = Reg; + } else { // Expression or a literal + ++ConstantBusUseCount; + } + } + } + } + + return ConstantBusUseCount <= 1; +} + +bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { + + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if (DstIdx == -1 || + Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) { + return true; + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + assert(DstIdx != -1); + const MCOperand &Dst = Inst.getOperand(DstIdx); + assert(Dst.isReg()); + const unsigned DstReg = mc2PseudoReg(Dst.getReg()); + + const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg()) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); + if (isRegIntersect(DstReg, SrcReg, TRI)) { + return false; + } + } + } + + return true; +} + +bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, + const SMLoc &IDLoc) { + if (!validateConstantBusLimitations(Inst)) { + Error(IDLoc, + "invalid operand (violates constant bus restrictions)"); + return false; + } + if (!validateEarlyClobberLimitations(Inst)) { + Error(IDLoc, + "destination must be different than all sources"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1751,6 +2241,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: + if (!validateInstruction(Inst, IDLoc)) { + return true; + } Inst.setLoc(IDLoc); Out.EmitInstruction(Inst, getSTI()); return false; @@ -1793,7 +2286,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) { return false; } - bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor) { if (ParseAsAbsoluteExpression(Major)) @@ -1810,7 +2302,6 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, } bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { - uint32_t Major; uint32_t Minor; @@ -1831,9 +2322,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); - getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, - Isa.Stepping, + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, + ISA.Stepping, "AMD", "AMDGPU"); return false; } @@ -1873,42 +2365,45 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return false; } -bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() { - std::string Metadata; - raw_string_ostream MS(Metadata); +bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() { + std::string YamlString; + raw_string_ostream YamlStream(YamlString); getLexer().setSkipSpace(false); bool FoundEnd = false; while (!getLexer().is(AsmToken::Eof)) { while (getLexer().is(AsmToken::Space)) { - MS << ' '; + YamlStream << getLexer().getTok().getString(); Lex(); } if (getLexer().is(AsmToken::Identifier)) { StringRef ID = getLexer().getTok().getIdentifier(); - if (ID == ".end_amdgpu_runtime_metadata") { + if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) { Lex(); FoundEnd = true; break; } } - MS << Parser.parseStringToEndOfStatement() - << getContext().getAsmInfo()->getSeparatorString(); + YamlStream << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); Parser.eatToEndOfStatement(); } getLexer().setSkipSpace(true); - if (getLexer().is(AsmToken::Eof) && !FoundEnd) - return TokError("expected directive .end_amdgpu_runtime_metadata not found"); + if (getLexer().is(AsmToken::Eof) && !FoundEnd) { + return TokError( + "expected directive .end_amdgpu_code_object_metadata not found"); + } - MS.flush(); + YamlStream.flush(); - getTargetStreamer().EmitRuntimeMetadata(Metadata); + if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString)) + return Error(getParser().getTok().getLoc(), "invalid code object metadata"); return false; } @@ -1926,7 +2421,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits()); while (true) { // Lex EndOfStatement. 
This is in a while loop, because lexing a comment @@ -1952,12 +2447,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } -bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { - getParser().getStreamer().SwitchSection( - AMDGPU::getHSATextSection(getContext())); - return false; -} - bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { if (getLexer().isNot(AsmToken::Identifier)) return TokError("expected symbol name"); @@ -1971,46 +2460,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { return false; } -bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { - if (getLexer().isNot(AsmToken::Identifier)) - return TokError("expected symbol name"); - - StringRef GlobalName = Parser.getTok().getIdentifier(); - - getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); - Lex(); - return false; -} - -bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { - if (getLexer().isNot(AsmToken::Identifier)) - return TokError("expected symbol name"); - - StringRef GlobalName = Parser.getTok().getIdentifier(); - - getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); - Lex(); - return false; -} - -bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { - getParser().getStreamer().SwitchSection( - AMDGPU::getHSADataGlobalAgentSection(getContext())); - return false; -} - -bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { - getParser().getStreamer().SwitchSection( - AMDGPU::getHSADataGlobalProgramSection(getContext())); - return false; -} - -bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { - getParser().getStreamer().SwitchSection( - AMDGPU::getHSARodataReadonlyAgentSection(getContext())); - return false; -} - bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -2020,33 +2469,15 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".hsa_code_object_isa") return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amdgpu_runtime_metadata") - return ParseDirectiveRuntimeMetadata(); + if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin) + return ParseDirectiveCodeObjectMetadata(); if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); - if (IDVal == ".hsatext") - return ParseSectionDirectiveHSAText(); - if (IDVal == ".amdgpu_hsa_kernel") return ParseDirectiveAMDGPUHsaKernel(); - if (IDVal == ".amdgpu_hsa_module_global") - return ParseDirectiveAMDGPUHsaModuleGlobal(); - - if (IDVal == ".amdgpu_hsa_program_global") - return ParseDirectiveAMDGPUHsaProgramGlobal(); - - if (IDVal == ".hsadata_global_agent") - return ParseSectionDirectiveHSADataGlobalAgent(); - - if (IDVal == ".hsadata_global_program") - return ParseSectionDirectiveHSADataGlobalProgram(); - - if (IDVal == ".hsarodata_readonly_agent") - return ParseSectionDirectiveHSARodataReadonlyAgent(); - return true; } @@ -2080,7 +2511,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { - // Try to parse with a custom parser OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -2195,11 +2625,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if 
(getParser().parseAbsoluteExpression(Int)) return MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -2208,7 +2648,7 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy, + AMDGPUOperand::ImmTy ImmTy, bool (*ConvertResult)(int64_t&)) { SMLoc S = Parser.getTok().getLoc(); int64_t Value = 0; @@ -2225,9 +2665,59 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, return MatchOperand_Success; } +OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { + StringRef Name = Parser.getTok().getString(); + if (!Name.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + Parser.Lex(); + + unsigned Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + // FIXME: How to verify the number of elements matches the number of src + // operands? + for (int I = 0; I < 3; ++I) { + if (I != 0) { + if (getLexer().is(AsmToken::RBrac)) + break; + + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + } + + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + int64_t Op; + if (getParser().parseAbsoluteExpression(Op)) + return MatchOperand_ParseFail; + + if (Op != 0 && Op != 1) + return MatchOperand_ParseFail; + Val |= (Op << I); + } + + Parser.Lex(); + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); + return MatchOperand_Success; +} + OperandMatchResultTy AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { + AMDGPUOperand::ImmTy ImmTy) { int64_t Bit = 0; SMLoc S = Parser.getTok().getLoc(); @@ -2257,11 +2747,11 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, return MatchOperand_Success; } -typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; - -void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, - OptionalImmIndexMap& OptionalIdx, - enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) { +static void addOptionalImmOperand( + MCInst& Inst, const OperandVector& Operands, + AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, + AMDGPUOperand::ImmTy ImmT, + int64_t Default = 0) { auto i = OptionalIdx.find(ImmT); if (i != OptionalIdx.end()) { unsigned Idx = i->second; @@ -2323,9 +2813,9 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } -void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; - bool GDSOnly = false; +void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, + bool IsGdsHardcoded) { + OptionalImmIndexMap OptionalIdx; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -2337,7 +2827,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { } if (Op.isToken() && Op.getToken() == "gds") { - GDSOnly = true; + IsGdsHardcoded = true; continue; } @@ -2345,10 +2835,14 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { OptionalIdx[Op.getImmTy()] = 
i; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); + AMDGPUOperand::ImmTy OffsetType = + (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle : + AMDGPUOperand::ImmTyOffset; - if (!GDSOnly) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); + + if (!IsGdsHardcoded) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 @@ -2357,6 +2851,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; + unsigned OperandIdx[4]; unsigned EnMask = 0; int SrcIdx = 0; @@ -2365,15 +2860,18 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { // Add the register arguments if (Op.isReg()) { - EnMask |= (1 << SrcIdx); + assert(SrcIdx < 4); + OperandIdx[SrcIdx] = Inst.size(); Op.addRegOperands(Inst, 1); ++SrcIdx; continue; } if (Op.isOff()) { - ++SrcIdx; + assert(SrcIdx < 4); + OperandIdx[SrcIdx] = Inst.size(); Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister)); + ++SrcIdx; continue; } @@ -2389,6 +2887,22 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { OptionalIdx[Op.getImmTy()] = i; } + assert(SrcIdx == 4); + + bool Compr = false; + if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) { + Compr = true; + Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]); + Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister); + Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister); + } + + for (auto i = 0; i < SrcIdx; ++i) { + if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) { + EnMask |= Compr? 
(0x3 << i * 2) : (0x1 << i); + } + } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr); @@ -2399,6 +2913,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { // s_waitcnt //===----------------------------------------------------------------------===// +static bool +encodeCnt( + const AMDGPU::IsaInfo::IsaVersion ISA, + int64_t &IntVal, + int64_t CntVal, + bool Saturate, + unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned), + unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned)) +{ + bool Failed = false; + + IntVal = encode(ISA, IntVal, CntVal); + if (CntVal != decode(ISA, IntVal)) { + if (Saturate) { + IntVal = encode(ISA, IntVal, -1); + } else { + Failed = true; + } + } + return Failed; +} + bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { StringRef CntName = Parser.getTok().getString(); int64_t CntVal; @@ -2411,33 +2947,49 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getLexer().isNot(AsmToken::Integer)) return true; + SMLoc ValLoc = Parser.getTok().getLoc(); if (getParser().parseAbsoluteExpression(CntVal)) return true; - if (getLexer().isNot(AsmToken::RParen)) - return true; + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); + bool Failed = true; + bool Sat = CntName.endswith("_sat"); - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); - if (CntName == "vmcnt") - IntVal = encodeVmcnt(IV, IntVal, CntVal); - else if (CntName == "expcnt") - IntVal = encodeExpcnt(IV, IntVal, CntVal); - else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(IV, IntVal, CntVal); - else + if (CntName == "vmcnt" || CntName == "vmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt); + } else if (CntName == "expcnt" || CntName == "expcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); + } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } + + if (Failed) { + Error(ValLoc, "too large value for " + CntName); return true; + } + + if (getLexer().isNot(AsmToken::RParen)) { + return true; + } + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { + const AsmToken NextToken = getLexer().peekTok(); + if (NextToken.is(AsmToken::Identifier)) { + Parser.Lex(); + } + } return false; } OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); - int64_t Waitcnt = getWaitcntBitMask(IV); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + int64_t Waitcnt = getWaitcntBitMask(ISA); SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { @@ -2459,7 +3011,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } -bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) { +bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, + int64_t &Width) { using namespace llvm::AMDGPU::Hwreg; if (Parser.getTok().getString() != "hwreg") @@ -2520,8 +3073,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, return false; } -OperandMatchResultTy 
-AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { +OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; int64_t Imm16Val = 0; @@ -2892,6 +3444,298 @@ bool AMDGPUOperand::isSendMsg() const { } //===----------------------------------------------------------------------===// +// parser helpers +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id) { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == Id) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { + if (getLexer().getKind() == Kind) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, + const StringRef ErrMsg) { + if (!trySkipToken(Kind)) { + Error(Parser.getTok().getLoc(), ErrMsg); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseExpr(int64_t &Imm) { + return !getParser().parseAbsoluteExpression(Imm); +} + +bool +AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { + SMLoc S = Parser.getTok().getLoc(); + if (getLexer().getKind() == AsmToken::String) { + Val = Parser.getTok().getStringContents(); + Parser.Lex(); + return true; + } else { + Error(S, ErrMsg); + return false; + } +} + +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +LLVM_READNONE +static unsigned +encodeBitmaskPerm(const unsigned AndMask, + const unsigned OrMask, + const unsigned XorMask) { + using namespace llvm::AMDGPU::Swizzle; + + return BITMASK_PERM_ENC | + (AndMask << BITMASK_AND_SHIFT) | + (OrMask << BITMASK_OR_SHIFT) | + (XorMask << BITMASK_XOR_SHIFT); +} + +bool +AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg) { + for (unsigned i = 0; i < OpNum; ++i) { + if (!skipToken(AsmToken::Comma, "expected a comma")){ + return false; + } + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (!parseExpr(Op[i])) { + return false; + } + if (Op[i] < MinVal || Op[i] > MaxVal) { + Error(ExprLoc, ErrMsg); + return false; + } + } + + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + int64_t Lane[LANE_NUM]; + if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, + "expected a 2-bit lane id")) { + Imm = QUAD_PERM_ENC; + for (auto i = 0; i < LANE_NUM; ++i) { + Imm |= Lane[i] << (LANE_SHIFT * i); + } + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + int64_t LaneIdx; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, + "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + if (parseSwizzleOperands(1, &LaneIdx, + 0, GroupSize - 1, + "lane id must be in the interval [0,group size - 1]")) { + Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + 
if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 1, 16, "group size must be in the interval [1,16]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!skipToken(AsmToken::Comma, "expected a comma")) { + return false; + } + + StringRef Ctl; + SMLoc StrLoc = Parser.getTok().getLoc(); + if (!parseString(Ctl)) { + return false; + } + if (Ctl.size() != BITMASK_WIDTH) { + Error(StrLoc, "expected a 5-character mask"); + return false; + } + + unsigned AndMask = 0; + unsigned OrMask = 0; + unsigned XorMask = 0; + + for (size_t i = 0; i < Ctl.size(); ++i) { + unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i); + switch(Ctl[i]) { + default: + Error(StrLoc, "invalid mask"); + return false; + case '0': + break; + case '1': + OrMask |= Mask; + break; + case 'p': + AndMask |= Mask; + break; + case 'i': + AndMask |= Mask; + XorMask |= Mask; + break; + } + } + + Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) { + + SMLoc OffsetLoc = Parser.getTok().getLoc(); + + if (!parseExpr(Imm)) { + return false; + } + if (!isUInt<16>(Imm)) { + Error(OffsetLoc, "expected a 16-bit offset"); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (skipToken(AsmToken::LParen, "expected a left parentheses")) { + + SMLoc ModeLoc = Parser.getTok().getLoc(); + bool Ok = false; + + if (trySkipId(IdSymbolic[ID_QUAD_PERM])) { + Ok = parseSwizzleQuadPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) { + Ok = parseSwizzleBitmaskPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BROADCAST])) { + Ok = parseSwizzleBroadcast(Imm); + } else if (trySkipId(IdSymbolic[ID_SWAP])) { + Ok = parseSwizzleSwap(Imm); + } else if (trySkipId(IdSymbolic[ID_REVERSE])) { + Ok = parseSwizzleReverse(Imm); + } else { + Error(ModeLoc, "expected a swizzle mode"); + } + + return Ok && skipToken(AsmToken::RParen, "expected a closing parentheses"); + } + + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (trySkipId("offset")) { + + bool Ok = false; + if (skipToken(AsmToken::Colon, "expected a colon")) { + if (trySkipId("swizzle")) { + Ok = parseSwizzleMacro(Imm); + } else { + Ok = parseSwizzleOffset(Imm); + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle)); + + return Ok? 
MatchOperand_Success : MatchOperand_ParseFail; + } else { + return MatchOperand_NoMatch; + } +} + +bool +AMDGPUOperand::isSwizzle() const { + return isImmTy(ImmTySwizzle); +} + +//===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// @@ -2980,52 +3824,60 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } -//===----------------------------------------------------------------------===// -// mimg -//===----------------------------------------------------------------------===// - -void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); // Add the register arguments - if (Op.isRegOrImm()) { - Op.addRegOrImmOperands(Inst, 1); + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); continue; - } else if (Op.isImmModifier()) { - OptionalIdx[Op.getImmTy()] = I; - } else { - llvm_unreachable("unexpected operand type"); } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } -void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { +//===----------------------------------------------------------------------===// +// mimg +//===----------------------------------------------------------------------===// + +void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic) { unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - // Add src, same as dst - ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + if (IsAtomic) { + // Add src, same as dst + ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + } OptionalImmIndexMap OptionalIdx; @@ -3053,6 +3905,10 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); } +void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { + cvtMIMG(Inst, Operands, true); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask); } @@ -3103,6 +3959,14 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3152,6 +4016,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -3169,7 +4035,12 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"src0_sel", 
AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, + {"compr", AMDGPUOperand::ImmTyExpCompr, true, nullptr }, {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr}, + {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr}, + {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, + {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, + {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr} }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { @@ -3186,6 +4057,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan res = parseSDWASel(Operands, Op.Name, Op.Type); } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) { res = parseSDWADstUnused(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyOpSel || + Op.Type == AMDGPUOperand::ImmTyOpSelHi || + Op.Type == AMDGPUOperand::ImmTyNegLo || + Op.Type == AMDGPUOperand::ImmTyNegHi) { + res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, + Op.ConvertResult); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } @@ -3211,25 +4088,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) return MatchOperand_NoMatch; } -void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) { - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - for (unsigned E = Operands.size(); I != E; ++I) - ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1); -} - -void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) { - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - if (TSFlags & SIInstrFlags::VOP3) { - cvtVOP3(Inst, Operands); - } else { - cvtId(Inst, Operands); - } -} - static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { // 1. 
This operand is input modifiers return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS @@ -3241,48 +4099,133 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; } -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx) { + unsigned Opc = Inst.getOpcode(); + unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - } else if (Op.isImm()) { - OptionalIdx[Op.getImmTy()] = I; - } else { - llvm_unreachable("unhandled operand type"); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) { + // This instruction has src modifiers + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); + } else { + llvm_unreachable("unhandled operand type"); + } + } + } else { + // No src modifiers + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isMod()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + Op.addRegOrImmOperands(Inst, 1); + } } } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + } + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + } // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0 - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || - Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) { + if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi || + Opc == AMDGPU::V_MAC_F16_e64_vi) { auto it = Inst.begin(); - std::advance( - it, - AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ? - AMDGPU::V_MAC_F16_e64 : - AMDGPU::V_MAC_F32_e64, - AMDGPU::OpName::src2_modifiers)); + std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 ++it; Inst.insert(it, Inst.getOperand(0)); // src2 = dst } } +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + cvtVOP3(Inst, Operands, OptionalIdx); +} + +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptIdx; + + cvtVOP3(Inst, Operands, OptIdx); + + // FIXME: This is messy. 
Parse the modifiers as if it was a normal VOP3 + // instruction, and then figure out where to actually put the modifiers + int Opc = Inst.getOpcode(); + + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1); + + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); + if (NegLoIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); + } + + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + const int ModOps[] = { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }; + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + unsigned NegLo = 0; + unsigned NegHi = 0; + + if (NegLoIdx != -1) { + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + NegLo = Inst.getOperand(NegLoIdx).getImm(); + NegHi = Inst.getOperand(NegHiIdx).getImm(); + } + + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + break; + + uint32_t ModVal = 0; + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + + if ((OpSelHi & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_1; + + if ((NegLo & (1 << J)) != 0) + ModVal |= SISrcMods::NEG; + + if ((NegHi & (1 << J)) != 0) + ModVal |= SISrcMods::NEG_HI; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal); + } +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// @@ -3311,6 +4254,14 @@ bool AMDGPUOperand::isGPRIdxMode() const { return isImm() && isUInt<4>(getImm()); } +bool AMDGPUOperand::isS16Imm() const { + return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm())); +} + +bool AMDGPUOperand::isU16Imm() const { + return isImm() && isUInt<16>(getImm()); +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); @@ -3436,7 +4387,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { - // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token. + // VOP2b (v_add_u32, v_sub_u32 ...) dpp use "vcc" token. // Skip it. 
continue; } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { @@ -3541,13 +4492,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); } +void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); +} + void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI()); } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType) { + uint64_t BasicInstType, bool skipVcc) { + using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; + bool skippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -3557,15 +4514,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || - BasicInstType == SIInstrFlags::VOP2)&& - Op.isReg() && - Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. - // Skip it. - continue; - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) + // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. + // Skip VCC only if we didn't skip it on previous iteration. + if (BasicInstType == SIInstrFlags::VOP2 && + (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { + skippedVcc = true; + continue; + } else if (BasicInstType == SIInstrFlags::VOPC && + Inst.getNumOperands() == 0) { + skippedVcc = true; + continue; + } + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments @@ -3573,29 +4537,38 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } + skippedVcc = false; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { - // V_NOP_sdwa_vi has no optional sdwa arguments + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, 
AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOPC: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; default: @@ -3609,10 +4582,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { auto it = Inst.begin(); std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } - } /// Force static initialization. 
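The `*_sat` waitcnt aliases introduced above rely on a small round-trip check: `encodeCnt` writes the requested count into the packed waitcnt word, decodes it back, and if the value did not survive the trip it either clamps the field to its maximum (for `vmcnt_sat`, `expcnt_sat`, `lgkmcnt_sat`) or raises the "too large value" error. The sketch below mirrors that logic with a made-up 4-bit field; the real field widths and the `encodeVmcnt`/`decodeVmcnt` helpers are ISA-version dependent and live in the AMDGPU utility headers, so the numbers here are illustrative only, not the actual encoding.

// Minimal, self-contained sketch of the saturating-counter round trip used by
// parseCnt. The 4-bit field width is an assumption for illustration; the real
// widths come from AMDGPU::IsaInfo and differ per counter and per ISA version.
#include <cstdint>
#include <iostream>

static const unsigned FieldWidth = 4;                  // assumed width
static const uint64_t FieldMax = (1u << FieldWidth) - 1;

// Toy stand-ins for encodeVmcnt/decodeVmcnt: the counter lives in the low bits.
static uint64_t encodeToy(uint64_t Waitcnt, uint64_t Cnt) {
  return (Waitcnt & ~FieldMax) | (Cnt & FieldMax);
}
static uint64_t decodeToy(uint64_t Waitcnt) { return Waitcnt & FieldMax; }

// Mirrors encodeCnt: returns true on failure (value does not fit and no _sat).
static bool encodeCntSketch(int64_t &IntVal, int64_t CntVal, bool Saturate) {
  IntVal = encodeToy(IntVal, CntVal);
  if (CntVal != static_cast<int64_t>(decodeToy(IntVal))) {
    if (Saturate)
      IntVal = encodeToy(IntVal, -1);   // clamp the field to all ones
    else
      return true;                      // parseCnt reports "too large value"
  }
  return false;
}

int main() {
  int64_t Wait = 0;
  bool Failed;

  Failed = encodeCntSketch(Wait, 7, /*Saturate=*/false);
  std::cout << "vmcnt(7):       failed=" << Failed << " field=" << decodeToy(Wait) << "\n";

  Failed = encodeCntSketch(Wait, 100, /*Saturate=*/false);
  std::cout << "vmcnt(100):     failed=" << Failed << " field=" << decodeToy(Wait) << "\n";

  Failed = encodeCntSketch(Wait, 100, /*Saturate=*/true);
  std::cout << "vmcnt_sat(100): failed=" << Failed << " field=" << decodeToy(Wait) << "\n";
  return 0;
}

The same parser changes also accept the new `offset:swizzle(...)` form for `ds_swizzle_b32` (quad-perm, bitmask-perm, broadcast, swap and reverse modes, per parseSwizzleOp above); the exact mode keywords come from the `IdSymbolic` table, which is not part of this hunk.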
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td index 45a7fe6..2e96c14 100644 --- a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>; +def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>; + def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -21,8 +23,8 @@ def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset" class MubufLoad <SDPatternOperator op> : PatFrag < (ops node:$ptr), (op node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; + return AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; }]>; def mubuf_load : MubufLoad <load>; @@ -55,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { string OpName = NAME # suffix; } +class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -76,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real <MTBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - Enc64 { + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -95,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + 
bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA<list<RegisterClass> vdataList, + list<RegisterClass> vaddrList=[]> { + RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); +} + +class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + (ins)))))); +} + +class getMTBUFAsmOps<int addrKind> { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr<int addrKind> { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); } -class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class MTBUF_Load_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo<opName, + (outs vdataClass:$vdata), + getMTBUFIns<addrKindCopy>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, 
VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, + i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, + i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + +class MTBUF_Store_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo<opName, + (outs), + getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + ValueType store_vt = i32, + SDPatternOperator st = null_frag> { + + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -674,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, 
"tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -705,12 +840,6 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", let Predicates = [isGCN] in { -// int_SI_vs_load_input -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) ->; - // Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), @@ -964,21 +1093,30 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; } // End Predicates = [Has16BitInsts] -class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : Pat < + (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, 
BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, @@ -1060,40 +1198,126 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; -class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag st> { + def : Pat < + (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)), + (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, + u16imm:$offset)), + (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm 
$inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; + +multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), 
(as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} -def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; -def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; -def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; -def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; } // End let Predicates = [isGCN] @@ -1209,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si<bits<3> op> { + def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1335,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, 
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { + def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td index a077001..fc516c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -88,18 +88,6 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> let has_vdst = 0; } -class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName, - (outs), - (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), - "$addr $offset0$offset1$gds"> { - - let has_data0 = 0; - let has_data1 = 0; - let has_vdst = 0; - let has_offset = 0; - let AsmMatchConverter = "cvtDSOffset01"; -} - class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), @@ -143,10 +131,24 @@ class DS_1A2D_RET<string opName, let hasPostISelHook = 1; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32> +class DS_1A2D_Off8_RET<string opName, + RegisterClass rc = VGPR_32, + RegisterClass src = rc> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, offset:$offset, gds:$gds), + (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds), + "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> { + + let has_offset = 0; + let AsmMatchConverter = "cvtDSOffset01"; + + let hasPostISelHook = 1; +} + +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds), "$vdst, $addr$offset$gds"> { let has_data0 = 0; @@ -174,6 +176,7 @@ class DS_1A_RET_GDS <string opName> : 
DS_Pseudo<opName, let has_data1 = 0; let has_gds = 0; let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; } class DS_0A_RET <string opName> : DS_Pseudo<opName, @@ -202,20 +205,46 @@ class DS_1A <string opName> : DS_Pseudo<opName, let has_data1 = 0; } -class DS_1A_GDS <string opName> : DS_Pseudo<opName, - (outs), - (ins VGPR_32:$addr), - "$addr gds"> { +class DS_GWS <string opName, dag ins, string asmOps> +: DS_Pseudo<opName, (outs), ins, asmOps> { + + let has_vdst = 0; + let has_addr = 0; + let has_data0 = 0; + let has_data1 = 0; + + let has_gds = 0; + let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; +} - let has_vdst = 0; - let has_data0 = 0; - let has_data1 = 0; - let has_offset = 0; +class DS_GWS_0D <string opName> +: DS_GWS<opName, + (ins offset:$offset, gds:$gds), "$offset gds">; + +class DS_GWS_1D <string opName> +: DS_GWS<opName, + (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> { + + let has_data0 = 1; +} + +class DS_VOID <string opName> : DS_Pseudo<opName, + (outs), (ins), ""> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 1; + let UseNamedOperandTable = 0; + let AsmMatchConverter = ""; + + let has_vdst = 0; + let has_addr = 0; + let has_data0 = 0; + let has_data1 = 0; + let has_offset = 0; let has_offset0 = 0; let has_offset1 = 0; - - let has_gds = 0; - let gdsValue = 1; + let has_gds = 0; } class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> @@ -324,9 +353,9 @@ def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">, def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">, AtomicNoRet<"", 1>; -def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, +def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, AtomicNoRet<"", 1>; -def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, +def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, AtomicNoRet<"", 1>; def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>, @@ -365,17 +394,17 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>, AtomicNoRet<"ds_max_f64", 1>; def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>, - AtomicNoRet<"ds_wrxchg_b64", 1>; -def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"ds_wrxchg2_b64", 1>; -def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"ds_wrxchg2st64_b64", 1>; - -def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">; -def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">; -def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">; -def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">; -def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">; + AtomicNoRet<"", 1>; +def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"", 1>; +def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"", 1>; + +def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; +def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; +def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">; +def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; +def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; @@ -386,7 +415,7 @@ def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">; def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">; def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">; def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">; -def 
DS_AND_SRC2_B32 : DS_1A<"ds_and_src_b32">; +def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">; def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">; def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">; def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">; @@ -407,11 +436,11 @@ def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">; def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">; def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">; -def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">; -def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">; +def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; +def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; } let mayStore = 0 in { @@ -429,30 +458,34 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>; def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>; } -let SubtargetPredicate = isSICI in { def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; -} //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. //===----------------------------------------------------------------------===// -// Remaining instructions: -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 let SubtargetPredicate = isCIVI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">, - AtomicNoRet<"ds_wrap_f32", 1>; +def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>; + +def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>, + AtomicNoRet<"", 1>; + +def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; + +let mayStore = 0 in { +def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>; +def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>; +} // End mayStore = 0 + +let mayLoad = 0 in { +def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>; +def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>; +} // End mayLoad = 0 + +def DS_NOP : DS_VOID<"ds_nop">; } // let SubtargetPredicate = isCIVI @@ -623,6 +656,7 @@ def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>; def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>; def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>; def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>; +def DS_NOP_si : DS_Real_si<0x14, DS_NOP>; def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>; def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>; def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>; @@ -651,8 +685,10 @@ def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>; def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>; def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>; -// FIXME: this instruction is actually CI/VI -def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>; +// These instruction are CI/VI only +def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>; +def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>; +def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>; def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>; def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>; @@ -744,6 +780,10 @@ def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, 
DS_WRITE_SRC2_B64>; def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>; +def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>; +def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>; +def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>; //===----------------------------------------------------------------------===// // VIInstructions.td @@ -787,12 +827,13 @@ def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>; def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>; def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>; def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>; +def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>; def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>; -def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>; -def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>; -def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>; -def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>; -def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>; +def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>; +def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>; +def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>; +def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>; +def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>; def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>; def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; @@ -815,7 +856,7 @@ def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>; def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>; def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>; def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>; -def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>; +def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>; def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>; def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>; def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>; @@ -824,6 +865,9 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; +def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>; +def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>; +def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>; def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>; def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>; def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>; @@ -865,6 +909,8 @@ def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>; def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>; def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>; def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>; +def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>; +def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>; def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>; def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>; def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>; @@ -904,3 +950,7 @@ def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>; def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>; def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_vi : DS_Real_vi<0xde, 
DS_WRITE_B96>; +def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; +def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; +def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2247cad..966c6fe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -20,20 +20,20 @@ #include "AMDGPUDisassembler.h" #include "AMDGPU.h" #include "AMDGPURegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/TargetRegistry.h" - using namespace llvm; #define DEBUG_TYPE "amdgpu-disassembler" @@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) { MCDisassembler::SoftFail; } +static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, + uint16_t NameIdx) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + if (OpIdx != -1) { + auto I = MI.begin(); + std::advance(I, OpIdx); + MI.insert(I, Op); + } + return OpIdx; +} + static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -61,32 +72,34 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } -#define DECODE_OPERAND2(RegClass, DecName) \ -static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ +static DecodeStatus StaticDecoderName(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \ - return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) +#define DECODE_OPERAND_REG(RegClass) \ +DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) -DECODE_OPERAND(VGPR_32) -DECODE_OPERAND(VS_32) -DECODE_OPERAND(VS_64) +DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VS_32) +DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) -DECODE_OPERAND(VReg_64) -DECODE_OPERAND(VReg_96) -DECODE_OPERAND(VReg_128) +DECODE_OPERAND_REG(VReg_64) +DECODE_OPERAND_REG(VReg_96) +DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0_XEXEC) -DECODE_OPERAND(SReg_64) -DECODE_OPERAND(SReg_64_XEXEC) -DECODE_OPERAND(SReg_128) -DECODE_OPERAND(SReg_256) -DECODE_OPERAND(SReg_512) +DECODE_OPERAND_REG(SReg_32) +DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_64) +DECODE_OPERAND_REG(SReg_64_XEXEC) +DECODE_OPERAND_REG(SReg_128) +DECODE_OPERAND_REG(SReg_256) +DECODE_OPERAND_REG(SReg_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, @@ -97,9 +110,20 @@ static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -#define GET_SUBTARGETINFO_ENUM 
-#include "AMDGPUGenSubtargetInfo.inc" -#undef GET_SUBTARGETINFO_ENUM +static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); +} + +#define DECODE_SDWA(DecName) \ +DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) + +DECODE_SDWA(Src32) +DECODE_SDWA(Src16) +DECODE_SDWA(VopcDst) #include "AMDGPUGenDisassemblerTables.inc" @@ -121,6 +145,7 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, assert(MI.getOpcode() == 0); assert(MI.getNumOperands() == 0); MCInst TmpInst; + HasLiteral = false; const auto SavedBytes = Bytes; if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { MI = TmpInst; @@ -136,9 +161,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, raw_ostream &WS, raw_ostream &CS) const { CommentStream = &CS; + bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. - assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA."); + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) + report_fatal_error("Disassembly not yet supported for subtarget"); const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -156,7 +183,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } + + Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + if (Res) { IsSDWA = true; break; } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -179,10 +209,40 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); } while (false); + if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + + if (Res && IsSDWA) + Res = convertSDWAInst(MI); + Size = Res ? 
(MaxInstBytesNum - Bytes.size()) : 0; return Res; } +DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) + // VOPC - insert clamp + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); + if (SDst != -1) { + // VOPC - insert VCC register as sdst + insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + AMDGPU::OpName::sdst); + } else { + // VOP1/2 - insert omod if present in instruction + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + } + } + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -259,10 +319,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const { + return decodeSrcOp(OPWV216, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -322,10 +390,15 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants // ToDo: deal with float/double constants - if (Bytes.size() < 4) - return errOperand(0, "cannot read literal, inst bytes left " + - Twine(Bytes.size())); - return MCOperand::createImm(eatBytes<uint32_t>(Bytes)); + if (!HasLiteral) { + if (Bytes.size() < 4) { + return errOperand(0, "cannot read literal, inst bytes left " + + Twine(Bytes.size())); + } + HasLiteral = true; + Literal = eatBytes<uint32_t>(Bytes); + } + return MCOperand::createImm(Literal); } MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { @@ -423,6 +496,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { case OPW64: return MCOperand::createImm(getInlineImmVal64(Imm)); case OPW16: + case OPWV216: return MCOperand::createImm(getInlineImmVal16(Imm)); default: llvm_unreachable("implement me"); @@ -436,6 +510,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return VGPR_32RegClassID; case OPW64: return VReg_64RegClassID; case OPW128: return VReg_128RegClassID; @@ -449,6 +524,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; @@ -462,6 +538,7 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; @@ -483,8 +560,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const 
OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW16 || Width == OPW32 || Width == OPW64); - if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); @@ -497,6 +572,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c switch (Width) { case OPW32: case OPW16: + case OPWV216: return decodeSpecialReg32(Val); case OPW64: return decodeSpecialReg64(Val); @@ -522,6 +598,11 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { case 124: return createRegOperand(M0); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); + case 235: return createRegOperand(SRC_SHARED_BASE); + case 236: return createRegOperand(SRC_SHARED_LIMIT); + case 237: return createRegOperand(SRC_PRIVATE_BASE); + case 238: return createRegOperand(SRC_PRIVATE_LIMIT); + // TODO: SRC_POPS_EXITING_WAVE_ID // ToDo: no support for vccz register case 251: break; // ToDo: no support for execz register @@ -545,6 +626,57 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, + unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + // XXX: static_cast<int> is needed to avoid stupid warning: + // compare with unsigned is always true + if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } + + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + return createRegOperand(getVgprClassId(Width), Val); + } + llvm_unreachable("unsupported target"); +} + +MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { + return decodeSDWASrc(OPW16, Val); +} + +MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { + return decodeSDWASrc(OPW32, Val); +} + + +MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { + using namespace AMDGPU::SDWA; + + assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && + "SDWAVopcDst should be present only on GFX9"); + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { + Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + if (Val > AMDGPU::EncValues::SGPR_MAX) { + return decodeSpecialReg64(Val); + } else { + return createSRegOperand(getSgprClassId(OPW64), Val); + } + } else { + return createRegOperand(AMDGPU::VCC); + } +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index ee5883a..4c755be 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -20,8 +20,8 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCDisassembler/MCSymbolizer.h" -#include <cstdint> #include 
<algorithm> +#include <cstdint> #include <memory> namespace llvm { @@ -39,6 +39,8 @@ class Twine; class AMDGPUDisassembler : public MCDisassembler { private: mutable ArrayRef<uint8_t> Bytes; + mutable uint32_t Literal; + mutable bool HasLiteral; public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : @@ -63,10 +65,14 @@ public: uint64_t Inst, uint64_t Address) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; + MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; + MCOperand decodeOperand_VSrcV216(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; @@ -85,6 +91,7 @@ public: OPW64, OPW128, OPW16, + OPWV216, OPW_LAST_, OPW_FIRST_ = OPW32 }; @@ -100,6 +107,11 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc16(unsigned Val) const; + MCOperand decodeSDWASrc32(unsigned Val) const; + MCOperand decodeSDWAVopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 48c6592..5480110 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -35,28 +35,59 @@ class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, "MEM_RAT_CACHELESS "#name, pattern>; -class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, - list<dag> pattern> - : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, +class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + dag outs, string name, list<dag> pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, mask, outs, ins, "MEM_RAT "#name, pattern>; class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> - : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, - i32imm:$rat_id, InstFlag:$eop), + : CF_MEM_RAT <0x1, ?, 0xf, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), (outs), "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" #!if(has_eop, ", $eop", ""), [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, (i32 imm:$rat_id))]>; -def RAT_MSKOR : CF_MEM_RAT <0x11, 0, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs), "MSKOR $rw_gpr.XW, $index_gpr", [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] > { let eop = 0; } + +multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> { + let Constraints = "$rw_gpr = $out_gpr", eop = 0, mayStore = 1 in { + def _RTN: CF_MEM_RAT <op_ret, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + (outs R600_Reg128:$out_gpr), + name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >; + def _NORET: CF_MEM_RAT <op_noret, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + (outs R600_Reg128:$out_gpr), + name ## " $rw_gpr, $index_gpr", [] >; + } +} + +// Swap no-ret is 
just store. Raw store to cached target +// can only store on dword, which exactly matches swap_no_ret. +defm RAT_ATOMIC_XCHG_INT : RAT_ATOMIC<1, 34, "ATOMIC_XCHG_INT">; +defm RAT_ATOMIC_CMPXCHG_INT : RAT_ATOMIC<4, 36, "ATOMIC_CMPXCHG_INT">; +defm RAT_ATOMIC_ADD : RAT_ATOMIC<7, 39, "ATOMIC_ADD">; +defm RAT_ATOMIC_SUB : RAT_ATOMIC<8, 40, "ATOMIC_SUB">; +defm RAT_ATOMIC_RSUB : RAT_ATOMIC<9, 41, "ATOMIC_RSUB">; +defm RAT_ATOMIC_MIN_INT : RAT_ATOMIC<10, 42, "ATOMIC_MIN_INT">; +defm RAT_ATOMIC_MIN_UINT : RAT_ATOMIC<11, 43, "ATOMIC_MIN_UINT">; +defm RAT_ATOMIC_MAX_INT : RAT_ATOMIC<12, 44, "ATOMIC_MAX_INT">; +defm RAT_ATOMIC_MAX_UINT : RAT_ATOMIC<13, 45, "ATOMIC_MAX_UINT">; +defm RAT_ATOMIC_AND : RAT_ATOMIC<14, 46, "ATOMIC_AND">; +defm RAT_ATOMIC_OR : RAT_ATOMIC<15, 47, "ATOMIC_OR">; +defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, "ATOMIC_XOR">; +defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">; +defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">; + } // End let Predicates = [isEGorCayman] //===----------------------------------------------------------------------===// @@ -257,6 +288,76 @@ def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), let Predicates = [isEGorCayman] in { +multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret, + SDPatternOperator node_ret, SDPatternOperator node_noret> { + // FIXME: Add _RTN version. We need per WI scratch location to store the old value + // EXTRACT_SUBREG here is dummy, we know the node has no uses + def : Pat<(i32 (node_noret i32:$ptr, i32:$data)), + (EXTRACT_SUBREG (inst_noret + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>; +} +multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret, + SDPatternOperator node_ret, SDPatternOperator node_noret, int C> { + // FIXME: Add _RTN version. We need per WI scratch location to store the old value + // EXTRACT_SUBREG here is dummy, we know the node has no uses + def : Pat<(i32 (node_noret i32:$ptr, C)), + (EXTRACT_SUBREG (inst_noret + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>; +} + +// CMPSWAP is pattern is special +// EXTRACT_SUBREG here is dummy, we know the node has no uses +// FIXME: Add _RTN version. 
We need per WI scratch location to store the old value +def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)), + (EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET + (INSERT_SUBREG + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3), + $data, sub0), + $ptr), sub1)>; + +defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN, + RAT_ATOMIC_XCHG_INT_NORET, + atomic_swap_global_ret, + atomic_swap_global_noret>; +defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET, + atomic_add_global_ret, atomic_add_global_noret>; +defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET, + atomic_sub_global_ret, atomic_sub_global_noret>; +defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN, + RAT_ATOMIC_MIN_INT_NORET, + atomic_min_global_ret, atomic_min_global_noret>; +defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN, + RAT_ATOMIC_MIN_UINT_NORET, + atomic_umin_global_ret, atomic_umin_global_noret>; +defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN, + RAT_ATOMIC_MAX_INT_NORET, + atomic_max_global_ret, atomic_max_global_noret>; +defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN, + RAT_ATOMIC_MAX_UINT_NORET, + atomic_umax_global_ret, atomic_umax_global_noret>; +defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET, + atomic_and_global_ret, atomic_and_global_noret>; +defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET, + atomic_or_global_ret, atomic_or_global_noret>; +defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET, + atomic_xor_global_ret, atomic_xor_global_noret>; +defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, + RAT_ATOMIC_INC_UINT_NORET, + atomic_add_global_ret, + atomic_add_global_noret, 1>; +defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, + RAT_ATOMIC_INC_UINT_NORET, + atomic_sub_global_ret, + atomic_sub_global_noret, -1>; +defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, + RAT_ATOMIC_DEC_UINT_NORET, + atomic_add_global_ret, + atomic_add_global_noret, -1>; +defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, + RAT_ATOMIC_DEC_UINT_NORET, + atomic_sub_global_ret, + atomic_sub_global_noret, 1>; + // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < // 0xA, "FMA_64", @@ -287,7 +388,7 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", VecALU >; -def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; +defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>; def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], @@ -337,7 +438,7 @@ defm CUBE_eg : CUBE_Common<0xC0>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, VecALU>; def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td index 849fb8a..edca6fc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// -def FLATAtomic : 
ComplexPattern<i64, 3, "SelectFlat">; +def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>; +def FLATOffset : ComplexPattern<i64, 3, "SelectFlat", [], [], -10>; //===----------------------------------------------------------------------===// // FLAT classes @@ -30,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -39,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -55,6 +60,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; // encoding fields bits<8> vaddr; @@ -62,9 +69,27 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : bits<8> vdst; bits<1> slc; bits<1> glc; - bits<1> tfe; - // 15-0 is reserved. + // Only valid on gfx9 + bits<1> lds = 0; // XXX - What does this actually do? + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); + + // Signed offset. Highest bit ignored for flat and treated as 12-bit + // unsigned for flat acceses. + bits<13> offset; + bits<1> nv = 0; // XXX - What does this actually do? + + // We don't use tfe right now, and it was removed in gfx9. + bits<1> tfe = 0; + + // Only valid on GFX9+ + let Inst{12-0} = offset; + let Inst{13} = lds; + let Inst{15-14} = seg; + let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; let Inst{24-18} = op; @@ -72,41 +97,70 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : let Inst{39-32} = vaddr; let Inst{47-40} = !if(ps.has_data, vdata, ?); // 54-48 is reserved. - let Inst{55} = tfe; + let Inst{55} = nv; // nv on GFX9+, TFE before. 
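As an aside on the offset field encoded just above: a minimal standalone C++ sketch (not part of the patch; the helper name and parameters are invented for illustration) of how that 13-bit value could be interpreted under the rule stated in the comment, sign-extended for global/scratch accesses and read as a 12-bit unsigned value with the top bit ignored for plain flat accesses.

#include <cstdint>

// Sketch only: decode the 13-bit FLAT offset field (Inst{12-0}) under the
// stated rule.
static int32_t decodeFlatOffset(uint16_t Raw, bool IsPlainFlat) {
  Raw &= 0x1FFF;                 // field is 13 bits wide
  if (IsPlainFlat)
    return Raw & 0x0FFF;         // flat: 12-bit unsigned, bit 12 ignored
  int32_t Val = Raw;             // global/scratch: signed 13-bit value
  if (Val & 0x1000)
    Val -= 0x2000;               // sign-extend
  return Val;
}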
let Inst{63-56} = !if(ps.has_vdst, vdst, ?); } -class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo< +class FLAT_Load_Pseudo <string opName, RegisterClass regClass, + bit HasSignedOffset = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), - (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe), - " $vdst, $vaddr$glc$slc$tfe"> { + !if(HasSignedOffset, + (ins VReg_64:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc), + (ins VReg_64:$vaddr, offset_u12:$offset, GLC:$glc, slc:$slc)), + " $vdst, $vaddr$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; } -class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo< +class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + +class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, + bit HasSignedOffset = 0> : FLAT_Pseudo< opName, (outs), - (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$glc$slc$tfe"> { + !if(HasSignedOffset, + (ins VReg_64:$vaddr, vdataClass:$vdata, offset_s13:$offset, GLC:$glc, slc:$slc), + (ins VReg_64:$vaddr, vdataClass:$vdata, offset_u12:$offset, GLC:$glc, slc:$slc)), + " $vaddr, $vdata$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; } +class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit HasSignedOffset = 0> { def "" : FLAT_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$slc$tfe", + !if(HasSignedOffset, + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)), + " $vaddr, $vdata$offset$slc", []>, AtomicNoRet <NAME, 0> { let mayLoad = 1; @@ -119,10 +173,12 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe), - " $vdst, $vaddr, $vdata glc$slc$tfe", + !if(HasSignedOffset, + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)), + " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, - (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>, + (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, AtomicNoRet <NAME, 1> { let mayLoad = 1; let mayStore = 1; @@ -136,7 +192,7 @@ multiclass FLAT_Atomic_Pseudo< class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}] >; def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; @@ -277,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isCI +let 
SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -284,16 +360,16 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; }]>; class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS; + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS; }]>; def atomic_flat_load : flat_ld <atomic_load>; @@ -310,31 +386,31 @@ def flat_truncstorei16 : flat_st <truncstorei16>; // Patterns for global loads with no offset. class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) + (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, $slc) >; class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 1, 0, 0) + (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 1, $slc) >; class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $addr, $data, 0, 0, 0) + (node vt:$data, (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc)), + (inst $vaddr, $data, $offset, 0, $slc) >; class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < // atomic store follows atomic binop convention so the address comes // first. 
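The flat_ld/flat_st PatFrags above gate pattern selection purely on the memory node's address space. A minimal standalone sketch of that rule follows (the enum and function names are invented, not taken from the patch): a flat load pattern may fold loads from the flat, global, or constant address spaces, while a flat store pattern is limited to flat and global.

// Sketch only: the address-space filter expressed by flat_ld / flat_st.
enum class AddrSpace { Flat, Global, Constant, Local, Private };

static bool flatLoadCanMatch(AddrSpace AS) {
  return AS == AddrSpace::Flat || AS == AddrSpace::Global ||
         AS == AddrSpace::Constant;
}

static bool flatStoreCanMatch(AddrSpace AS) {
  return AS == AddrSpace::Flat || AS == AddrSpace::Global;
}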
- (node i64:$addr, vt:$data), - (inst $addr, $data, 1, 0, 0) + (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), + (inst $vaddr, $data, $offset, 1, $slc) >; class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : Pat < - (vt (node i64:$addr, data_vt:$data)), - (inst $addr, $data, 0, 0) + (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), + (inst $vaddr, $data, $offset, $slc) >; let Predicates = [isCIVI] in { @@ -528,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index dd3b46f..cd9e7fb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -13,9 +13,22 @@ #include "GCNHazardRecognizer.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/Debug.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> +#include <limits> +#include <set> +#include <vector> using namespace llvm; @@ -26,7 +39,8 @@ using namespace llvm; GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), - ST(MF.getSubtarget<SISubtarget>()) { + ST(MF.getSubtarget<SISubtarget>()), + TII(*ST.getInstrInfo()) { MaxLookAhead = 5; } @@ -58,8 +72,19 @@ static bool isRFE(unsigned Opcode) { return Opcode == AMDGPU::S_RFE_B64; } -static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { +static bool isSMovRel(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_MOVRELS_B32: + case AMDGPU::S_MOVRELS_B64: + case AMDGPU::S_MOVRELD_B32: + case AMDGPU::S_MOVRELD_B64: + return true; + default: + return false; + } +} +static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; @@ -96,6 +121,13 @@ 
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return NoopHazard; + if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && + checkReadM0Hazards(MI) > 0) + return NoopHazard; + + if (checkAnyInstHazards(MI) > 0) + return NoopHazard; + return NoHazard; } @@ -104,11 +136,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + int WaitStates = std::max(0, checkAnyInstHazards(MI)); + if (SIInstrInfo::isSMRD(*MI)) - return std::max(0, checkSMRDHazards(MI)); + return std::max(WaitStates, checkSMRDHazards(MI)); if (SIInstrInfo::isVALU(*MI)) { - int WaitStates = std::max(0, checkVALUHazards(MI)); + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); if (SIInstrInfo::isVMEM(*MI)) WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); @@ -122,19 +156,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (isRWLane(MI->getOpcode())) WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if (TII.isVINTRP(*MI)) + WaitStates = std::max(WaitStates, checkReadM0Hazards(MI)); + return WaitStates; } if (isSGetReg(MI->getOpcode())) - return std::max(0, checkGetRegHazards(MI)); + return std::max(WaitStates, checkGetRegHazards(MI)); if (isSSetReg(MI->getOpcode())) - return std::max(0, checkSetRegHazards(MI)); + return std::max(WaitStates, checkSetRegHazards(MI)); if (isRFE(MI->getOpcode())) - return std::max(0, checkRFEHazards(MI)); + return std::max(WaitStates, checkRFEHazards(MI)); - return 0; + if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) + return std::max(WaitStates, checkReadM0Hazards(MI)); + + return WaitStates; } void GCNHazardRecognizer::EmitNoop() { @@ -142,14 +182,12 @@ void GCNHazardRecognizer::EmitNoop() { } void GCNHazardRecognizer::AdvanceCycle() { - // When the scheduler detects a stall, it will call AdvanceCycle() without // emitting any instructions. if (!CurrCycleInstr) return; - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr); + unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); // Keep track of emitted instructions EmittedInstrs.push_front(CurrCycleInstr); @@ -180,7 +218,6 @@ void GCNHazardRecognizer::RecedeCycle() { int GCNHazardRecognizer::getWaitStatesSince( function_ref<bool(MachineInstr *)> IsHazard) { - int WaitStates = -1; for (MachineInstr *MI : EmittedInstrs) { ++WaitStates; @@ -204,7 +241,6 @@ int GCNHazardRecognizer::getWaitStatesSinceDef( int GCNHazardRecognizer::getWaitStatesSinceSetReg( function_ref<bool(MachineInstr *)> IsHazard) { - auto IsHazardFn = [IsHazard] (MachineInstr *MI) { return isSSetReg(MI->getOpcode()) && IsHazard(MI); }; @@ -281,7 +317,6 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); int WaitStatesNeeded = 0; WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD); @@ -293,7 +328,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { // A read of an SGPR by SMRD instruction requires 4 wait states when the // SGPR was written by a VALU instruction. 
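To make the wait-state bookkeeping above concrete, here is a simplified standalone sketch (an assumed model, not the LLVM implementation; all names are invented): the recognizer walks its most-recent-first history of emitted instructions, counts how many states have elapsed since the hazardous VALU write of the SGPR, and pads with no-ops until the four-state requirement quoted in the comment is met.

#include <algorithm>
#include <functional>
#include <limits>
#include <vector>

struct EmittedInst { bool IsVALU; int DefReg; };   // minimal stand-in

// Wait states elapsed since the newest history entry matching IsHazard.
static int waitStatesSinceDef(const std::vector<EmittedInst> &History, int Reg,
                              const std::function<bool(const EmittedInst &)> &IsHazard) {
  int WaitStates = -1;
  for (const EmittedInst &I : History) {           // most recent first
    ++WaitStates;
    if (I.DefReg == Reg && IsHazard(I))
      return WaitStates;
  }
  return std::numeric_limits<int>::max();          // nothing hazardous in the window
}

// No-ops still needed before an SMRD may read SGPR `SgprUse`.
static int smrdSgprNoopsNeeded(const std::vector<EmittedInst> &History, int SgprUse) {
  const int SmrdSgprWaitStates = 4;                // the rule quoted above
  int Elapsed = waitStatesSinceDef(History, SgprUse,
                                   [](const EmittedInst &I) { return I.IsVALU; });
  return std::max(0, SmrdSgprWaitStates - Elapsed);
}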
int SmrdSgprWaitStates = 4; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; for (const MachineOperand &Use : SMRD->uses()) { if (!Use.isReg()) @@ -486,7 +521,6 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { } int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return 0; @@ -500,3 +534,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); return RFEWaitStates - WaitStatesNeeded; } + +int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { + if (MI->isDebugValue()) + return 0; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (!ST.hasSMovFedHazard()) + return 0; + + // Check for any instruction reading an SGPR after a write from + // s_mov_fed_b32. + int MovFedWaitStates = 1; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Use : MI->uses()) { + if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + auto IsHazardFn = [] (MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; + }; + int WaitStatesNeededForUse = + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { + if (!ST.hasReadM0Hazard()) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + int SMovRelWaitStates = 1; + auto IsHazardFn = [TII] (MachineInstr *MI) { + return TII->isSALU(*MI); + }; + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 0ab82ff..5680c3d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -34,6 +34,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { std::list<MachineInstr*> EmittedInstrs; const MachineFunction &MF; const SISubtarget &ST; + const SIInstrInfo &TII; int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); int getWaitStatesSinceDef(unsigned Reg, @@ -52,6 +53,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkVALUHazards(MachineInstr *VALU); int checkRWLaneHazards(MachineInstr *RWLane); int checkRFEHazards(MachineInstr *RFE); + int checkAnyInstHazards(MachineInstr *MI); + int checkReadM0Hazards(MachineInstr *SMovRel); public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp new file mode 100644 index 0000000..2e7641c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -0,0 +1,530 @@ +//===--------------------- GCNIterativeScheduler.cpp - --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNIterativeScheduler.h" +#include "GCNSchedStrategy.h" +#include "SIMachineFunctionInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace llvm { + std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG); +} + +// shim accessors for different order containers +static inline MachineInstr *getMachineInstr(MachineInstr *MI) { + return MI; +} +static inline MachineInstr *getMachineInstr(const SUnit *SU) { + return SU->getInstr(); +} +static inline MachineInstr *getMachineInstr(const SUnit &SU) { + return SU.getInstr(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +static void printRegion(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const LiveIntervals *LIS, + unsigned MaxInstNum = + std::numeric_limits<unsigned>::max()) { + auto BB = Begin->getParent(); + OS << BB->getParent()->getName() << ":BB#" << BB->getNumber() + << ' ' << BB->getName() << ":\n"; + auto I = Begin; + MaxInstNum = std::max(MaxInstNum, 1u); + for (; I != End && MaxInstNum; ++I, --MaxInstNum) { + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (I != End) { + OS << "\t...\n"; + I = std::prev(End); + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (End != BB->end()) { // print boundary inst if present + OS << "----\n"; + if (LIS) OS << LIS->getInstructionIndex(*End) << '\t'; + OS << *End; + } +} + +LLVM_DUMP_METHOD +static void printLivenessInfo(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const LiveIntervals *LIS) { + const auto BB = Begin->getParent(); + const auto &MRI = BB->getParent()->getRegInfo(); + + const auto LiveIns = getLiveRegsBefore(*Begin, *LIS); + OS << "LIn RP: "; + getRegPressure(MRI, LiveIns).print(OS); + + const auto BottomMI = End == BB->end() ? 
std::prev(End) : End; + const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS); + OS << "LOt RP: "; + getRegPressure(MRI, LiveOuts).print(OS); +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printRegions(raw_ostream &OS) const { + const auto &ST = MF.getSubtarget<SISubtarget>(); + for (const auto R : Regions) { + OS << "Region to schedule "; + printRegion(OS, R->Begin, R->End, LIS, 1); + printLivenessInfo(OS, R->Begin, R->End, LIS); + OS << "Max RP: "; + R->MaxPressure.print(OS, &ST); + } +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const { + OS << "\nAfter scheduling "; + printRegion(OS, R->Begin, R->End, LIS); + printSchedRP(OS, R->MaxPressure, RP); + OS << '\n'; +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const { + const auto &ST = MF.getSubtarget<SISubtarget>(); + OS << "RP before: "; + Before.print(OS, &ST); + OS << "RP after: "; + After.print(OS, &ST); +} + +#endif + +// DAG builder helper +class GCNIterativeScheduler::BuildDAG { + GCNIterativeScheduler &Sch; + SmallVector<SUnit*, 8> TopRoots; +public: + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) + : Sch(_Sch) { + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + + Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, + /*TrackLaneMask*/true); + Sch.Topo.InitDAGTopologicalSorting(); + + SmallVector<SUnit*, 8> BotRoots; + Sch.findRootsAndBiasEdges(TopRoots, BotRoots); + } + ~BuildDAG() { + Sch.BaseClass::exitRegion(); + Sch.BaseClass::finishBlock(); + } + ArrayRef<const SUnit*> getTopRoots() const { + return TopRoots; + } +}; + +class GCNIterativeScheduler::OverrideLegacyStrategy { + GCNIterativeScheduler &Sch; + Region &Rgn; + std::unique_ptr<MachineSchedStrategy> SaveSchedImpl; + GCNRegPressure SaveMaxRP; +public: + OverrideLegacyStrategy(Region &R, + MachineSchedStrategy &OverrideStrategy, + GCNIterativeScheduler &_Sch) + : Sch(_Sch) + , Rgn(R) + , SaveSchedImpl(std::move(_Sch.SchedImpl)) + , SaveMaxRP(R.MaxPressure) { + Sch.SchedImpl.reset(&OverrideStrategy); + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + } + ~OverrideLegacyStrategy() { + Sch.BaseClass::exitRegion(); + Sch.BaseClass::finishBlock(); + Sch.SchedImpl.release(); + Sch.SchedImpl = std::move(SaveSchedImpl); + } + void schedule() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + DEBUG(dbgs() << "\nScheduling "; + printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); + Sch.BaseClass::schedule(); + + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + Sch.RegionEnd = Rgn.End; + //assert(Rgn.End == Sch.RegionEnd); + Rgn.Begin = Sch.RegionBegin; + Rgn.MaxPressure.clear(); + } + void restoreOrder() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + // DAG SUnits are stored using original region's order + // so just use SUnits as the restoring schedule + Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP); + } +}; + +namespace { +// just a stub to make base class happy +class SchedStrategyStub : public MachineSchedStrategy { +public: + bool shouldTrackPressure() const override { return false; } + bool shouldTrackLaneMasks() const override { return false; } + void initialize(ScheduleDAGMI *DAG) override {} + SUnit *pickNode(bool 
&IsTopNode) override { return nullptr; } + void schedNode(SUnit *SU, bool IsTopNode) override {} + void releaseTopNode(SUnit *SU) override {} + void releaseBottomNode(SUnit *SU) override {} +}; +} // namespace + +GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S) + : BaseClass(C, llvm::make_unique<SchedStrategyStub>()) + , Context(C) + , Strategy(S) + , UPTracker(*LIS) { +} + +// returns max pressure for a region +GCNRegPressure +GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) + const { + // For the purpose of pressure tracking bottom inst of the region should + // be also processed. End is either BB end, BB terminator inst or sched + // boundary inst. + auto const BBEnd = Begin->getParent()->end(); + auto const BottomMI = End == BBEnd ? std::prev(End) : End; + + // scheduleRegions walks bottom to top, so its likely we just get next + // instruction to track + auto AfterBottomMI = std::next(BottomMI); + if (AfterBottomMI == BBEnd || + &*AfterBottomMI != UPTracker.getLastTrackedMI()) { + UPTracker.reset(*BottomMI); + } else { + assert(UPTracker.isValid()); + } + + for (auto I = BottomMI; I != Begin; --I) + UPTracker.recede(*I); + + UPTracker.recede(*Begin); + + assert(UPTracker.isValid() || + (dbgs() << "Tracked region ", + printRegion(dbgs(), Begin, End, LIS), false)); + return UPTracker.moveMaxPressure(); +} + +// returns max pressure for a tentative schedule +template <typename Range> GCNRegPressure +GCNIterativeScheduler::getSchedulePressure(const Region &R, + Range &&Schedule) const { + auto const BBEnd = R.Begin->getParent()->end(); + GCNUpwardRPTracker RPTracker(*LIS); + if (R.End != BBEnd) { + // R.End points to the boundary instruction but the + // schedule doesn't include it + RPTracker.reset(*R.End); + RPTracker.recede(*R.End); + } else { + // R.End doesn't point to the boundary instruction + RPTracker.reset(*std::prev(BBEnd)); + } + for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) { + RPTracker.recede(*getMachineInstr(*--I)); + } + return RPTracker.moveMaxPressure(); +} + +void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs); + if (NumRegionInstrs > 2) { + Regions.push_back( + new (Alloc.Allocate()) + Region { Begin, End, NumRegionInstrs, + getRegionPressure(Begin, End), nullptr }); + } +} + +void GCNIterativeScheduler::schedule() { // overriden + // do nothing + DEBUG( + printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); + if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { + dbgs() << "Max RP: "; + Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>()); + } + dbgs() << '\n'; + ); +} + +void GCNIterativeScheduler::finalizeSchedule() { // overriden + if (Regions.empty()) + return; + switch (Strategy) { + case SCHEDULE_MINREGONLY: scheduleMinReg(); break; + case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break; + case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break; + } +} + +// Detach schedule from SUnits and interleave it with debug values. +// Returned schedule becomes independent of DAG state. 
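Before detachSchedule() below, a brief aside on getRegionPressure() above: it is a bottom-up (upward) walk that starts from the region's live-out pressure, recedes over each instruction, and returns the running maximum. A condensed standalone sketch with a single integer standing in for the tracker's per-kind register counts (purely illustrative; the real tracker works on lane masks per virtual register):

#include <algorithm>
#include <vector>

// One instruction's effect on liveness when stepping from below it to above
// it: its uses become live, its defs stop being live.
struct InstDelta {
  int NewUses;  // registers that first become live at this instruction
  int Defs;     // registers defined here that were live below it
};

// Shape of the GCNUpwardRPTracker::recede() loop driven by getRegionPressure().
static int maxPressureUpward(const std::vector<InstDelta> &Region,
                             int LiveOutPressure) {
  int Cur = LiveOutPressure;
  int Max = Cur;
  for (auto I = Region.rbegin(); I != Region.rend(); ++I) {
    int AtInst = Cur + I->NewUses;  // defs and uses are both live at MI
    Max = std::max(Max, AtInst);
    Cur = AtInst - I->Defs;         // above MI, its defs are no longer live
  }
  return Max;
}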
+std::vector<MachineInstr*> +GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const { + std::vector<MachineInstr*> Res; + Res.reserve(Schedule.size() * 2); + + if (FirstDbgValue) + Res.push_back(FirstDbgValue); + + const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end(); + for (auto SU : Schedule) { + Res.push_back(SU->getInstr()); + const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) { + return P.second == SU->getInstr(); + }); + if (D != DbgE) + Res.push_back(D->first); + } + return Res; +} + +void GCNIterativeScheduler::setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP) { + R.BestSchedule.reset( + new TentativeSchedule{ detachSchedule(Schedule), MaxRP }); +} + +void GCNIterativeScheduler::scheduleBest(Region &R) { + assert(R.BestSchedule.get() && "No schedule specified"); + scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure); + R.BestSchedule.reset(); +} + +// minimal required region scheduler, works for ranges of SUnits*, +// SUnits or MachineIntrs* +template <typename Range> +void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP) { + assert(RegionBegin == R.Begin && RegionEnd == R.End); + assert(LIS != nullptr); +#ifndef NDEBUG + const auto SchedMaxRP = getSchedulePressure(R, Schedule); +#endif + auto BB = R.Begin->getParent(); + auto Top = R.Begin; + for (const auto &I : Schedule) { + auto MI = getMachineInstr(I); + if (MI != &*Top) { + BB->remove(MI); + BB->insert(Top, MI); + if (!MI->isDebugValue()) + LIS->handleMove(*MI, true); + } + if (!MI->isDebugValue()) { + // Reset read - undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true, + /*IgnoreDead*/false); + // Adjust liveness and add missing dead+read-undef flags. 
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } + Top = std::next(MI->getIterator()); + } + RegionBegin = getMachineInstr(Schedule.front()); + + // Schedule consisting of MachineInstr* is considered 'detached' + // and already interleaved with debug values + if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) { + placeDebugValues(); + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + //assert(R.End == RegionEnd); + RegionEnd = R.End; + } + + R.Begin = RegionBegin; + R.MaxPressure = MaxRP; + +#ifndef NDEBUG + const auto RegionMaxRP = getRegionPressure(R); + const auto &ST = MF.getSubtarget<SISubtarget>(); +#endif + assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP)) + || (dbgs() << "Max RP mismatch!!!\n" + "RP for schedule (calculated): ", + SchedMaxRP.print(dbgs(), &ST), + dbgs() << "RP for schedule (reported): ", + MaxRP.print(dbgs(), &ST), + dbgs() << "RP after scheduling: ", + RegionMaxRP.print(dbgs(), &ST), + false)); +} + +// Sort recorded regions by pressure - highest at the front +void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + std::sort(Regions.begin(), Regions.end(), + [&ST, TargetOcc](const Region *R1, const Region *R2) { + return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); + }); +} + +/////////////////////////////////////////////////////////////////////////////// +// Legacy MaxOccupancy Strategy + +// Tries to increase occupancy applying minreg scheduler for a sequence of +// most demanding regions. Obtained schedules are saved as BestSchedule for a +// region. +// TargetOcc is the best achievable occupancy for a kernel. +// Returns better occupancy on success or current occupancy on fail. +// BestSchedules aren't deleted on fail. +unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { + // TODO: assert Regions are sorted descending by pressure + const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc + << ", current = " << Occ << '\n'); + + auto NewOcc = TargetOcc; + for (auto R : Regions) { + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) + break; + + DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); + printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + const auto MaxRP = getSchedulePressure(*R, MinSchedule); + DEBUG(dbgs() << "Occupancy improvement attempt:\n"; + printSchedRP(dbgs(), R->MaxPressure, MaxRP)); + + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); + if (NewOcc <= Occ) + break; + + setBestSchedule(*R, MinSchedule, MaxRP); + } + DEBUG(dbgs() << "New occupancy = " << NewOcc + << ", prev occupancy = " << Occ << '\n'); + return std::max(NewOcc, Occ); +} + +void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( + bool TryMaximizeOccupancy) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + + sortRegionsByPressure(TgtOcc); + auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + + if (TryMaximizeOccupancy && Occ < TgtOcc) + Occ = tryMaximizeOccupancy(TgtOcc); + + // This is really weird but for some magic scheduling regions twice + // gives performance improvement + const int NumPasses = Occ < TgtOcc ? 
2 : 1; + + TgtOcc = std::min(Occ, TgtOcc); + DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " << TgtOcc << '\n'); + GCNMaxOccupancySchedStrategy LStrgy(Context); + + for (int I = 0; I < NumPasses; ++I) { + // running first pass with TargetOccupancy = 0 mimics previous scheduling + // approach and is a performance magic + LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc); + for (auto R : Regions) { + OverrideLegacyStrategy Ovr(*R, LStrgy, *this); + + Ovr.schedule(); + const auto RP = getRegionPressure(*R); + DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + + if (RP.getOccupancy(ST) < TgtOcc) { + DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { + DEBUG(dbgs() << ", scheduling minimal register\n"); + scheduleBest(*R); + } else { + DEBUG(dbgs() << ", restoring\n"); + Ovr.restoreOrder(); + assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Minimal Register Strategy + +void GCNIterativeScheduler::scheduleMinReg(bool force) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + sortRegionsByPressure(TgtOcc); + + auto MaxPressure = Regions.front()->MaxPressure; + for (auto R : Regions) { + if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc)) + break; + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + + const auto RP = getSchedulePressure(*R, MinSchedule); + DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + dbgs() << "\nWarning: Pressure becomes worse after minreg!"; + printSchedRP(dbgs(), R->MaxPressure, RP); + }); + + if (!force && MaxPressure.less(ST, RP, TgtOcc)) + break; + + scheduleRegion(*R, MinSchedule, RP); + DEBUG(printSchedResult(dbgs(), R, RP)); + + MaxPressure = RP; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h new file mode 100644 index 0000000..df3afce --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -0,0 +1,118 @@ +//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H + +#include "GCNRegPressure.h" + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class GCNIterativeScheduler : public ScheduleDAGMILive { + typedef ScheduleDAGMILive BaseClass; +public: + enum StrategyKind { + SCHEDULE_MINREGONLY, + SCHEDULE_MINREGFORCED, + SCHEDULE_LEGACYMAXOCCUPANCY + }; + + GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S); + + void schedule() override; + + void enterRegion(MachineBasicBlock *BB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned RegionInstrs) override; + + void finalizeSchedule() override; + +protected: + + typedef ArrayRef<const SUnit*> ScheduleRef; + + struct TentativeSchedule { + std::vector<MachineInstr*> Schedule; + GCNRegPressure MaxPressure; + }; + + struct Region { + // Fields except for BestSchedule are supposed to reflect current IR state + // `const` fields are to emphasize they shouldn't change for any schedule. + MachineBasicBlock::iterator Begin; + // End is either a boundary instruction or end of basic block + const MachineBasicBlock::iterator End; + const unsigned NumRegionInstrs; + GCNRegPressure MaxPressure; + + // best schedule for the region so far (not scheduled yet) + std::unique_ptr<TentativeSchedule> BestSchedule; + }; + + SpecificBumpPtrAllocator<Region> Alloc; + std::vector<Region*> Regions; + + MachineSchedContext *Context; + const StrategyKind Strategy; + mutable GCNUpwardRPTracker UPTracker; + + class BuildDAG; + class OverrideLegacyStrategy; + + template <typename Range> + GCNRegPressure getSchedulePressure(const Region &R, + Range &&Schedule) const; + + GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) const; + + GCNRegPressure getRegionPressure(const Region &R) const { + return getRegionPressure(R.Begin, R.End); + } + + void setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + void scheduleBest(Region &R); + + std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const; + + void sortRegionsByPressure(unsigned TargetOcc); + + template <typename Range> + void scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + unsigned tryMaximizeOccupancy(unsigned TargetOcc = + std::numeric_limits<unsigned>::max()); + + void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true); + void scheduleMinReg(bool force = false); + + void printRegions(raw_ostream &OS) const; + void printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const; + void printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const; +}; + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp new file mode 100644 index 0000000..0657f67 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -0,0 +1,268 @@ +//===----------------------- GCNMinRegStrategy.cpp - ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ScheduleDAG.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { +class GCNMinRegScheduler { + struct Candidate : ilist_node<Candidate> { + const SUnit *SU; + int Priority; + + Candidate(const SUnit *SU_, int Priority_ = 0) + : SU(SU_), Priority(Priority_) {} + }; + + SpecificBumpPtrAllocator<Candidate> Alloc; + typedef simple_ilist<Candidate> Queue; + Queue RQ; // Ready queue + + std::vector<unsigned> NumPreds; + + bool isScheduled(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max(); + } + + void setIsScheduled(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max(); + } + + unsigned getNumPreds(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max()); + return NumPreds[SU->NodeNum]; + } + + unsigned decNumPreds(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max()); + return --NumPreds[SU->NodeNum]; + } + + void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits); + + int getReadySuccessors(const SUnit *SU) const; + int getNotReadySuccessors(const SUnit *SU) const; + + template <typename Calc> + unsigned findMax(unsigned Num, Calc C); + + Candidate* pickCandidate(); + + void bumpPredsPriority(const SUnit *SchedSU, int Priority); + void releaseSuccessors(const SUnit* SU, int Priority); + +public: + std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG); +}; +} // namespace + +void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { + NumPreds.resize(SUnits.size()); + for (unsigned I = 0; I < SUnits.size(); ++I) + NumPreds[I] = SUnits[I].NumPredsLeft; +} + +int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const { + unsigned NumSchedSuccs = 0; + for (auto SDep : SU->Succs) { + bool wouldBeScheduled = true; + for (auto PDep : SDep.getSUnit()->Preds) { + auto PSU = PDep.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (PSU != SU && !isScheduled(PSU)) { + wouldBeScheduled = false; + break; + } + } + NumSchedSuccs += wouldBeScheduled ? 
1 : 0; + } + return NumSchedSuccs; +} + +int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const { + return SU->Succs.size() - getReadySuccessors(SU); +} + +template <typename Calc> +unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) { + assert(!RQ.empty() && Num <= RQ.size()); + typedef decltype(C(*RQ.begin())) T; + T Max = std::numeric_limits<T>::min(); + unsigned NumMax = 0; + for (auto I = RQ.begin(); Num; --Num) { + T Cur = C(*I); + if (Cur >= Max) { + if (Cur > Max) { + Max = Cur; + NumMax = 1; + } else + ++NumMax; + auto &Cand = *I++; + RQ.remove(Cand); + RQ.push_front(Cand); + continue; + } + ++I; + } + return NumMax; +} + +GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() { + do { + unsigned Num = RQ.size(); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return C.Priority; }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + int Res = getNotReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " + << Res << " successors, metric = " << -Res << '\n'); + return -Res; + }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting most producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + auto Res = getReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " + << Res << " successors, metric = " << Res << '\n'); + return Res; + }); + if (Num == 1) break; + + Num = Num ? Num : RQ.size(); + DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; }); + assert(Num == 1); + } while (false); + + return &RQ.front(); +} + +void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { + SmallPtrSet<const SUnit*, 32> Set; + for (const auto &S : SchedSU->Succs) { + if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) || + S.getKind() != SDep::Data) + continue; + for (const auto &P : S.getSUnit()->Preds) { + auto PSU = P.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (PSU != SchedSU && !isScheduled(PSU)) { + Set.insert(PSU); + } + } + } + SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end()); + while (!Worklist.empty()) { + auto SU = Worklist.pop_back_val(); + assert(!SU->isBoundaryNode()); + for (const auto &P : SU->Preds) { + if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) && + Set.insert(P.getSUnit()).second) + Worklist.push_back(P.getSUnit()); + } + } + DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum + << ")'s non-ready successors of " << Priority + << " priority in ready queue: "); + const auto SetEnd = Set.end(); + for (auto &C : RQ) { + if (Set.find(C.SU) != SetEnd) { + C.Priority = Priority; + DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); + } + } + DEBUG(dbgs() << '\n'); +} + +void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) { + for (const auto &S : SU->Succs) { + auto SuccSU = S.getSUnit(); + if (S.isWeak()) + continue; + assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0); + if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0) + RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority)); + } +} + +std::vector<const SUnit*> 
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG) { + const auto &SUnits = DAG.SUnits; + std::vector<const SUnit*> Schedule; + Schedule.reserve(SUnits.size()); + + initNumPreds(SUnits); + + int StepNo = 0; + + for (auto SU : TopRoots) { + RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo)); + } + releaseSuccessors(&DAG.EntrySU, StepNo); + + while (!RQ.empty()) { + DEBUG( + dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n" + "Ready queue:"; + for (auto &C : RQ) + dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; + dbgs() << '\n'; + ); + + auto C = pickCandidate(); + assert(C); + RQ.remove(*C); + auto SU = C->SU; + DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + + releaseSuccessors(SU, StepNo); + Schedule.push_back(SU); + setIsScheduled(SU); + + if (getReadySuccessors(SU) == 0) + bumpPredsPriority(SU, StepNo); + + ++StepNo; + } + assert(SUnits.size() == Schedule.size()); + + return Schedule; +} + +namespace llvm { +std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG) { + GCNMinRegScheduler S; + return S.schedule(TopRoots, DAG); +} +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp new file mode 100644 index 0000000..1d02c7f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -0,0 +1,492 @@ +//===------------------------- GCNRegPressure.cpp - -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNRegPressure.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void llvm::printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + dbgs() << "Live regs at " << SI << ": " + << *LIS.getInstructionFromIndex(SI); + unsigned Num = 0; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + if (!LIS.hasInterval(Reg)) + continue; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + bool firstTime = true; + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) continue; + if (firstTime) { + dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo()) + << '\n'; + firstTime = false; + } + dbgs() << " " << S << '\n'; + ++Num; + } + } else if (LI.liveAt(SI)) { + dbgs() << " " << LI << '\n'; + ++Num; + } + } + if (!Num) dbgs() << " <none>\n"; +} + +static bool isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2) { + if (S1.size() != S2.size()) + return false; + + for (const auto &P : S1) { + auto I = S2.find(P.first); + if (I == S2.end() || I->second != P.second) + return false; + } + return true; +} + +#endif + +/////////////////////////////////////////////////////////////////////////////// +// GCNRegPressure + +unsigned GCNRegPressure::getRegKind(unsigned Reg, + const MachineRegisterInfo &MRI) { + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + const auto RC = MRI.getRegClass(Reg); + auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + return 
STI->isSGPRClass(RC) ? + (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); +} + +void GCNRegPressure::inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI) { + if (NewMask == PrevMask) + return; + + int Sign = 1; + if (NewMask < PrevMask) { + std::swap(NewMask, PrevMask); + Sign = -1; + } +#ifndef NDEBUG + const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg); +#endif + switch (auto Kind = getRegKind(Reg, MRI)) { + case SGPR32: + case VGPR32: + assert(PrevMask.none() && NewMask == MaxMask); + Value[Kind] += Sign; + break; + + case SGPR_TUPLE: + case VGPR_TUPLE: + assert(NewMask < MaxMask || NewMask == MaxMask); + assert(PrevMask < NewMask); + + Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += + Sign * countPopulation((~PrevMask & NewMask).getAsInteger()); + + if (PrevMask.none()) { + assert(NewMask.any()); + Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight(); + } + break; + + default: llvm_unreachable("Unknown register kind"); + } +} + +bool GCNRegPressure::less(const SISubtarget &ST, + const GCNRegPressure& O, + unsigned MaxOccupancy) const { + const auto SGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(getSGPRNum())); + const auto VGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGPRNum())); + const auto OtherSGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); + const auto OtherVGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); + + const auto Occ = std::min(SGPROcc, VGPROcc); + const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); + if (Occ != OtherOcc) + return Occ > OtherOcc; + + bool SGPRImportant = SGPROcc < VGPROcc; + const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc; + + // if both pressures disagree on what is more important compare vgprs + if (SGPRImportant != OtherSGPRImportant) { + SGPRImportant = false; + } + + // compare large regs pressure + bool SGPRFirst = SGPRImportant; + for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) { + if (SGPRFirst) { + auto SW = getSGPRTuplesWeight(); + auto OtherSW = O.getSGPRTuplesWeight(); + if (SW != OtherSW) + return SW < OtherSW; + } else { + auto VW = getVGPRTuplesWeight(); + auto OtherVW = O.getVGPRTuplesWeight(); + if (VW != OtherVW) + return VW < OtherVW; + } + } + return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()): + (getVGPRNum() < O.getVGPRNum()); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { + OS << "VGPRs: " << getVGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + OS << ", SGPRs: " << getSGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; + OS << ", LVGPR WT: " << getVGPRTuplesWeight() + << ", LSGPR WT: " << getSGPRTuplesWeight(); + if (ST) OS << " -> Occ: " << getOccupancy(*ST); + OS << '\n'; +} +#endif + + +static LaneBitmask getDefRegMask(const MachineOperand &MO, + const MachineRegisterInfo &MRI) { + assert(MO.isDef() && MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + // We don't rely on read-undef flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0 ? 
+ MRI.getMaxLaneMaskForVReg(MO.getReg()) : + MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); +} + +static LaneBitmask getUsedRegMask(const MachineOperand &MO, + const MachineRegisterInfo &MRI, + const LiveIntervals &LIS) { + assert(MO.isUse() && MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + if (auto SubReg = MO.getSubReg()) + return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); + + auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg()); + if (MaxMask.getAsInteger() == 1) // cannot have subregs + return MaxMask; + + // For a tentative schedule LIS isn't updated yet but livemask should remain + // the same on any schedule. Subreg defs can be reordered but they all must + // dominate uses anyway. + auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex(); + return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); +} + +static SmallVector<RegisterMaskPair, 8> +collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + SmallVector<RegisterMaskPair, 8> Res; + for (const auto &MO : MI.operands()) { + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + if (!MO.isUse() || !MO.readsReg()) + continue; + + auto const UsedMask = getUsedRegMask(MO, MRI, LIS); + + auto Reg = MO.getReg(); + auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) { + return RM.RegUnit == Reg; + }); + if (I != Res.end()) + I->LaneMask |= UsedMask; + else + Res.push_back(RegisterMaskPair(Reg, UsedMask)); + } + return Res; +} + +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + +LaneBitmask llvm::getLiveLaneMask(unsigned Reg, + SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + LaneBitmask LiveMask; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) + if (S.liveAt(SI)) { + LiveMask |= S.LaneMask; + assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) || + LiveMask == MRI.getMaxLaneMaskForVReg(Reg)); + } + } else if (LI.liveAt(SI)) { + LiveMask = MRI.getMaxLaneMaskForVReg(Reg); + } + return LiveMask; +} + +GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + GCNRPTracker::LiveRegSet LiveRegs; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = TargetRegisterInfo::index2VirtReg(I); + if (!LIS.hasInterval(Reg)) + continue; + auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); + if (LiveMask.any()) + LiveRegs[Reg] = LiveMask; + } + return LiveRegs; +} + +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = getLiveRegsAfter(MI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + +void GCNUpwardRPTracker::recede(const MachineInstr &MI) { + assert(MRI && "call reset first"); + + LastTrackedMI = &MI; + + if (MI.isDebugValue()) + return; + + auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI); + + // calc pressure at the MI (defs + uses) + auto AtMIPressure = CurPressure; + for (const auto &U : RegUses) { + auto LiveMask = LiveRegs[U.RegUnit]; + AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI); + } + // update max pressure + MaxPressure = max(AtMIPressure, MaxPressure); + + for (const auto &MO : 
MI.defs()) { + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) || + MO.isDead()) + continue; + + auto Reg = MO.getReg(); + auto I = LiveRegs.find(Reg); + if (I == LiveRegs.end()) + continue; + auto &LiveMask = I->second; + auto PrevMask = LiveMask; + LiveMask &= ~getDefRegMask(MO, *MRI); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + if (LiveMask.none()) + LiveRegs.erase(I); + } + for (const auto &U : RegUses) { + auto &LiveMask = LiveRegs[U.RegUnit]; + auto PrevMask = LiveMask; + LiveMask |= U.LaneMask; + CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); + } + assert(CurPressure == getRegPressure(*MRI, LiveRegs)); +} + +bool GCNDownwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + LastTrackedMI = nullptr; + MBBEnd = MI.getParent()->end(); + NextMI = &MI; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = getLiveRegsBefore(*NextMI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + return true; +} + +bool GCNDownwardRPTracker::advanceBeforeNext() { + assert(MRI && "call reset first"); + + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + + SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex(); + assert(SI.isValid()); + + // Remove dead registers or mask bits. + for (auto &It : LiveRegs) { + const LiveInterval &LI = LIS.getInterval(It.first); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) { + auto PrevMask = It.second; + It.second &= ~S.LaneMask; + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + } + } else if (!LI.liveAt(SI)) { + auto PrevMask = It.second; + It.second = LaneBitmask::getNone(); + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + if (It.second.none()) + LiveRegs.erase(It.first); + } + + MaxPressure = max(MaxPressure, CurPressure); + + return true; +} + +void GCNDownwardRPTracker::advanceToNext() { + LastTrackedMI = &*NextMI++; + + // Add new registers or mask bits. + for (const auto &MO : LastTrackedMI->defs()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getDefRegMask(MO, *MRI); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +bool GCNDownwardRPTracker::advance() { + // If we have just called reset live set is actual. 
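(Still inside GCNDownwardRPTracker::advance(): the guard that follows skips advanceBeforeNext() immediately after reset(), because reset() has already computed the live set before NextMI.) The forward walk is otherwise a two-phase step per instruction: drop lanes that died, then add lanes the instruction defines, taking the running maximum after each phase. A condensed standalone sketch, with a plain set standing in for the lane-mask live-register map:

#include <algorithm>
#include <cstddef>
#include <set>
#include <vector>

// Illustrative stand-in for one instruction's liveness effect.
struct FwdInst {
  std::vector<int> DeadBefore;  // regs whose live ranges end before this MI
  std::vector<int> Defs;        // regs (re)defined by this MI
};

// Shape of advanceBeforeNext() / advanceToNext() applied over a whole block.
static std::size_t maxPressureDownward(const std::vector<FwdInst> &Block,
                                       std::set<int> Live /* live-ins */) {
  std::size_t Max = Live.size();
  for (const FwdInst &MI : Block) {
    for (int R : MI.DeadBefore)       // advanceBeforeNext(): prune dead regs
      Live.erase(R);
    Max = std::max(Max, Live.size());
    for (int R : MI.Defs)             // advanceToNext(): add definitions
      Live.insert(R);
    Max = std::max(Max, Live.size());
  }
  return Max;
}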
+ if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext())) + return false; + advanceToNext(); + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) { + while (NextMI != End) + if (!advance()) return false; + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy) { + reset(*Begin, LiveRegsCopy); + return advance(End); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, + const GCNRPTracker::LiveRegSet &TrackedLR, + const TargetRegisterInfo *TRI) { + for (auto const &P : TrackedLR) { + auto I = LISLR.find(P.first); + if (I == LISLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in LIS reported set\n"; + } + else if (I->second != P.second) { + dbgs() << " " << PrintReg(P.first, TRI) + << " masks doesn't match: LIS reported " + << PrintLaneMask(I->second) + << ", tracked " + << PrintLaneMask(P.second) + << '\n'; + } + } + for (auto const &P : LISLR) { + auto I = TrackedLR.find(P.first); + if (I == TrackedLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in tracked set\n"; + } + } +} + +bool GCNUpwardRPTracker::isValid() const { + const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); + const auto &TrackedLR = LiveRegs; + + if (!isEqual(LISLR, TrackedLR)) { + dbgs() << "\nGCNUpwardRPTracker error: Tracked and" + " LIS reported livesets mismatch:\n"; + printLivesAt(SI, LIS, *MRI); + reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo()); + return false; + } + + auto LISPressure = getRegPressure(*MRI, LISLR); + if (LISPressure != CurPressure) { + dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: "; + CurPressure.print(dbgs()); + dbgs() << "LIS rpt: "; + LISPressure.print(dbgs()); + return false; + } + return true; +} + +void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end() && It->second.any()) + OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':' + << PrintLaneMask(It->second); + } + OS << '\n'; +} +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h new file mode 100644 index 0000000..5dfe440 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -0,0 +1,207 @@ +//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H +#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H + +#include "AMDGPUSubtarget.h" + +#include <limits> + +namespace llvm { + +struct GCNRegPressure { + enum RegKind { + SGPR32, + SGPR_TUPLE, + VGPR32, + VGPR_TUPLE, + TOTAL_KINDS + }; + + GCNRegPressure() { + clear(); + } + + bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } + + void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + + unsigned getSGPRNum() const { return Value[SGPR32]; } + unsigned getVGPRNum() const { return Value[VGPR32]; } + + unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } + unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } + + unsigned getOccupancy(const SISubtarget &ST) const { + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs(getVGPRNum())); + } + + void inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI); + + bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const { + return getOccupancy(ST) > O.getOccupancy(ST); + } + + bool less(const SISubtarget &ST, const GCNRegPressure& O, + unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; + + bool operator==(const GCNRegPressure &O) const { + return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value); + } + + bool operator!=(const GCNRegPressure &O) const { + return !(*this == O); + } + + void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const; + void dump() const { print(dbgs()); } + +private: + unsigned Value[TOTAL_KINDS]; + + static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI); + + friend GCNRegPressure max(const GCNRegPressure &P1, + const GCNRegPressure &P2); +}; + +inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { + GCNRegPressure Res; + for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I) + Res.Value[I] = std::max(P1.Value[I], P2.Value[I]); + return Res; +} + +class GCNRPTracker { +public: + typedef DenseMap<unsigned, LaneBitmask> LiveRegSet; + +protected: + const LiveIntervals &LIS; + LiveRegSet LiveRegs; + GCNRegPressure CurPressure, MaxPressure; + const MachineInstr *LastTrackedMI = nullptr; + mutable const MachineRegisterInfo *MRI = nullptr; + GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} +public: + // live regs for the current state + const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } + const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } + + void clearMaxPressure() { MaxPressure.clear(); } + + // returns MaxPressure, resetting it + decltype(MaxPressure) moveMaxPressure() { + auto Res = MaxPressure; + MaxPressure.clear(); + return Res; + } + decltype(LiveRegs) moveLiveRegs() { + return std::move(LiveRegs); + } + static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI); +}; + +class GCNUpwardRPTracker : public GCNRPTracker { +public: + GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + // reset tracker to the point just below MI + // filling live regs upon this point using LIS + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + + // move to the state just above the MI + void recede(const MachineInstr &MI); + + // checks whether the tracker's 
state after receding MI corresponds + // to reported by LIS + bool isValid() const; +}; + +class GCNDownwardRPTracker : public GCNRPTracker { + // Last position of reset or advanceBeforeNext + MachineBasicBlock::const_iterator NextMI; + + MachineBasicBlock::const_iterator MBBEnd; + +public: + GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + + const MachineBasicBlock::const_iterator getNext() const { return NextMI; } + + // Reset tracker to the point before the MI + // filling live regs upon this point using LIS. + // Returns false if block is empty except debug values. + bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + + // Move to the state right before the next MI. Returns false if reached + // end of the block. + bool advanceBeforeNext(); + + // Move to the state at the MI, advanceBeforeNext has to be called first. + void advanceToNext(); + + // Move to the state at the next MI. Returns false if reached end of block. + bool advance(); + + // Advance instructions until before End. + bool advance(MachineBasicBlock::const_iterator End); + + // Reset to Begin and advance to End. + bool advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy = nullptr); +}; + +LaneBitmask getLiveLaneMask(unsigned Reg, + SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, + const LiveIntervals &LIS) { + return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, + MI.getParent()->getParent()->getRegInfo()); +} + +inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, + const LiveIntervals &LIS) { + return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, + MI.getParent()->getParent()->getRegInfo()); +} + +template <typename Range> +GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, + Range &&LiveRegs) { + GCNRegPressure Res; + for (const auto &RM : LiveRegs) + Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI); + return Res; +} + +void printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 2f88033..155b400 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -18,14 +18,15 @@ #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/Support/MathExtras.h" -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" using namespace llvm; GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : - GenericScheduler(C) { } + GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { } static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, const MachineFunction &MF) { @@ -35,18 +36,46 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), ST.getOccupancyWithNumVGPRs(VGPRs)); return std::min(MinRegOccupancy, - ST.getOccupancyWithLocalMemSize(MFI->getLDSSize())); + ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), + *MF.getFunction())); +} 
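getMaxWaves() above, like the occupancy queries used throughout the new scheduler code, reduces to taking the minimum of the wave counts each resource budget permits. A self-contained sketch with made-up per-SIMD budgets; the real limits come from the subtarget tables and differ per generation:

#include <algorithm>

static unsigned wavesLimitedBy(unsigned Used, unsigned Budget,
                               unsigned MaxWaves) {
  if (Used == 0)
    return MaxWaves;            // resource unused: no constraint
  return std::min(MaxWaves, Budget / Used);
}

// Occupancy = min over the SGPR-, VGPR- and LDS-imposed limits.
static unsigned maxWaves(unsigned SGPRs, unsigned VGPRs, unsigned LDSBytes) {
  const unsigned MaxWavesPerEU = 10;   // illustrative ceiling
  const unsigned SGPRBudget = 800;     // illustrative, not a real table
  const unsigned VGPRBudget = 256;
  const unsigned LDSBudget = 64 * 1024;
  return std::min({wavesLimitedBy(SGPRs, SGPRBudget, MaxWavesPerEU),
                   wavesLimitedBy(VGPRs, VGPRBudget, MaxWavesPerEU),
                   wavesLimitedBy(LDSBytes, LDSBudget, MaxWavesPerEU)});
}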
+ +void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { + GenericScheduler::initialize(DAG); + + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); + + MF = &DAG->MF; + + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + + // FIXME: This is also necessary, because some passes that run after + // scheduling and before regalloc increase register pressure. + const int ErrorMargin = 3; + + SGPRExcessLimit = Context->RegClassInfo + ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin; + VGPRExcessLimit = Context->RegClassInfo + ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin; + if (TargetOccupancy) { + SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true); + VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy); + } else { + SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, + SRI->getSGPRPressureSet()); + VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, + SRI->getVGPRPressureSet()); + } + + SGPRCriticalLimit -= ErrorMargin; + VGPRCriticalLimit -= ErrorMargin; } void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, - int SGPRPressure, - int VGPRPressure, - int SGPRExcessLimit, - int VGPRExcessLimit, - int SGPRCriticalLimit, - int VGPRCriticalLimit) { + unsigned SGPRPressure, + unsigned VGPRPressure) { Cand.SU = SU; Cand.AtTop = AtTop; @@ -66,8 +95,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); } - int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; - int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; + unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; // If two instructions increase the pressure of different register sets // by the same amount, the generic scheduler will prefer to schedule the @@ -77,7 +106,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // only for VGPRs or only for SGPRs. // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. - const int MaxVGPRPressureInc = 16; + const unsigned MaxVGPRPressureInc = 16; bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; @@ -86,11 +115,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // to increase the likelihood we don't go over the limits. We should improve // the analysis to look through dependencies to find the path with the least // register pressure. - // FIXME: This is also necessary, because some passes that run after - // scheduling and before regalloc increase register pressure. - const int ErrorMargin = 3; - VGPRExcessLimit -= ErrorMargin; - SGPRExcessLimit -= ErrorMargin; // We only need to update the RPDelata for instructions that increase // register pressure. 
Instructions that decrease or keep reg pressure @@ -103,7 +127,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet()); - Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit); + Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } // Register pressure is considered 'CRITICAL' if it is approaching a value @@ -111,9 +135,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both // has the same cost, so we don't need to prefer one over the other. - VGPRCriticalLimit -= ErrorMargin; - SGPRCriticalLimit -= ErrorMargin; - int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; @@ -134,27 +155,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand) { - const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()]; unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()]; - unsigned SGPRExcessLimit = - Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); - unsigned VGPRExcessLimit = - Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); - unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF); - unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true); - unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves); - ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, - SGPRPressure, VGPRPressure, - SGPRExcessLimit, VGPRExcessLimit, - SGPRCriticalLimit, VGPRCriticalLimit); + SGPRPressure, VGPRPressure); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg); @@ -167,16 +177,6 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, } } -static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) { - switch (Reason) { - default: - return Reason; - case GenericSchedulerBase::RegCritical: - case GenericSchedulerBase::RegExcess: - return -Reason; - } -} - // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { @@ -224,9 +224,9 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. 
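One detail worth noting in the initCandidate() hunk above: the SGPR excess line previously read setUnitInc(NewSGPRPressure = SGPRExcessLimit), an assignment where a subtraction was intended, and the patch corrects it to report the actual overflow. The intended bookkeeping, reduced to a sketch: a candidate is penalized only when it pushes a register set past its margin-adjusted limit, and the penalty is the size of the overflow.

// Value passed to Cand.RPDelta.Excess.setUnitInc() in spirit; names are
// simplified stand-ins for the scheduler's pressure-set variables.
static int excessUnitInc(unsigned NewPressure, unsigned ExcessLimit) {
  return NewPressure >= ExcessLimit
             ? static_cast<int>(NewPressure - ExcessLimit)
             : 0;
}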
DEBUG( dbgs() << "Top Cand: "; - traceCandidate(BotCand); - dbgs() << "Bot Cand: "; traceCandidate(TopCand); + dbgs() << "Bot Cand: "; + traceCandidate(BotCand); ); SchedCandidate Cand; if (TopCand.Reason == BotCand.Reason) { @@ -249,9 +249,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { Cand = BotCand; } else { - int TopRank = getBidirectionalReasonRank(TopCand.Reason); - int BotRank = getBidirectionalReasonRank(BotCand.Reason); - if (TopRank > BotRank) { + if (BotCand.Reason > TopCand.Reason) { Cand = TopCand; } else { Cand = BotCand; @@ -310,3 +308,242 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; } + +GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S) : + ScheduleDAGMILive(C, std::move(S)), + ST(MF.getSubtarget<SISubtarget>()), + MFI(*MF.getInfo<SIMachineFunctionInfo>()), + StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), + *MF.getFunction())), + MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { + + DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); +} + +void GCNScheduleDAGMILive::schedule() { + if (Stage == 0) { + // Just record regions at the first pass. + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + return; + } + + std::vector<MachineInstr*> Unsched; + Unsched.reserve(NumRegionInstrs); + for (auto &I : *this) + Unsched.push_back(&I); + + GCNRegPressure PressureBefore; + if (LIS) { + PressureBefore = Pressure[RegionIdx]; + + DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); + } + + ScheduleDAGMILive::schedule(); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + + if (!LIS) + return; + + // Check the results of scheduling. + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + auto PressureAfter = getRealRegPressure(); + + DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + Pressure[RegionIdx] = PressureAfter; + DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + return; + } + unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), + PressureAfter.getVGPRNum(), MF); + unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), + PressureBefore.getVGPRNum(), MF); + DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << + ", after " << WavesAfter << ".\n"); + + // We could not keep current target occupancy because of the just scheduled + // region. Record new occupancy for next scheduling cycle. 
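The remainder of schedule() below records the new occupancy and, when the region's achievable wave count regressed, reverts to the original instruction order. The decision itself, condensed into a standalone sketch (the real code also moves the instructions back and updates LiveIntervals through handleMove()):

#include <algorithm>

struct RegionWaves {
  unsigned Before;  // occupancy allowed by pressure before scheduling
  unsigned After;   // occupancy allowed by pressure after scheduling
};

// Remember the lowest occupancy recorded for the function so far, and keep
// the new schedule only if the region's wave count did not drop.
static bool keepSchedule(const RegionWaves &W, unsigned &MinOccupancy) {
  MinOccupancy = std::min(MinOccupancy, std::max(W.After, W.Before));
  return W.After >= W.Before;
}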
+  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+  if (NewOccupancy < MinOccupancy) {
+    MinOccupancy = NewOccupancy;
+    DEBUG(dbgs() << "Occupancy lowered for the function to "
+                 << MinOccupancy << ".\n");
+  }
+
+  if (WavesAfter >= WavesBefore) {
+    Pressure[RegionIdx] = PressureAfter;
+    return;
+  }
+
+  DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RegionEnd = RegionBegin;
+  for (MachineInstr *MI : Unsched) {
+    if (MI->getIterator() != RegionEnd) {
+      BB->remove(MI);
+      BB->insert(RegionEnd, MI);
+      LIS->handleMove(*MI, true);
+    }
+    // Reset read-undef flags and update them later.
+    for (auto &Op : MI->operands())
+      if (Op.isReg() && Op.isDef())
+        Op.setIsUndef(false);
+    RegisterOperands RegOpers;
+    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+    if (ShouldTrackLaneMasks) {
+      // Adjust liveness and add missing dead+read-undef flags.
+      SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+      RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+    } else {
+      // Adjust for missing dead-def flags.
+      RegOpers.detectDeadDefs(*MI, *LIS);
+    }
+    RegionEnd = MI->getIterator();
+    ++RegionEnd;
+    DEBUG(dbgs() << "Scheduling " << *MI);
+  }
+  RegionBegin = Unsched.front()->getIterator();
+  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+
+  placeDebugValues();
+}
+
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+  GCNDownwardRPTracker RPTracker(*LIS);
+  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+  return RPTracker.moveMaxPressure();
+}
+
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+  GCNDownwardRPTracker RPTracker(*LIS);
+
+  // If the block has only one successor then the live-ins of that successor
+  // are the live-outs of the current block. We can reuse the calculated live
+  // set if the successor will be sent to scheduling after the current block.
+  const MachineBasicBlock *OnlySucc = nullptr;
+  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+    SlotIndexes *Ind = LIS->getSlotIndexes();
+    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+      OnlySucc = *MBB->succ_begin();
+  }
+
+  // Scheduler sends regions from the end of the block upwards.
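// Before the bottom-up walk announced above, a rough sketch of the
// single-successor reuse set up earlier in this function: the live set at the
// end of this block may seed the successor only when that successor is unique,
// non-empty, and scheduled after the current block (the slot-index check
// above). The helper name is illustrative.
static bool mayReuseLiveOutsAsLiveIns(const MachineBasicBlock &MBB) {
  return MBB.succ_size() == 1 && !(*MBB.succ_begin())->empty();
}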
+ size_t CurRegion = RegionIdx; + for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) + if (Regions[CurRegion].first->getParent() != MBB) + break; + --CurRegion; + + auto I = MBB->begin(); + auto LiveInIt = MBBLiveIns.find(MBB); + if (LiveInIt != MBBLiveIns.end()) { + auto LiveIn = std::move(LiveInIt->second); + RPTracker.reset(*MBB->begin(), &LiveIn); + MBBLiveIns.erase(LiveInIt); + } else { + I = Regions[CurRegion].first; + RPTracker.reset(*I); + } + + for ( ; ; ) { + I = RPTracker.getNext(); + + if (Regions[CurRegion].first == I) { + LiveIns[CurRegion] = RPTracker.getLiveRegs(); + RPTracker.clearMaxPressure(); + } + + if (Regions[CurRegion].second == I) { + Pressure[CurRegion] = RPTracker.moveMaxPressure(); + if (CurRegion-- == RegionIdx) + break; + } + RPTracker.advanceToNext(); + RPTracker.advanceBeforeNext(); + } + + if (OnlySucc) { + if (I != MBB->end()) { + RPTracker.advanceToNext(); + RPTracker.advance(MBB->end()); + } + RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs()); + RPTracker.advanceBeforeNext(); + MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); + } +} + +void GCNScheduleDAGMILive::finalizeSchedule() { + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); + + LiveIns.resize(Regions.size()); + Pressure.resize(Regions.size()); + + do { + Stage++; + RegionIdx = 0; + MachineBasicBlock *MBB = nullptr; + + if (Stage > 1) { + // Retry function scheduling if we found resulting occupancy and it is + // lower than used for first pass scheduling. This will give more freedom + // to schedule low register pressure blocks. + // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + + if (!LIS || StartingOccupancy <= MinOccupancy) + break; + + DEBUG(dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); + + S.setTargetOccupancy(MinOccupancy); + } + + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; + + if (RegionBegin->getParent() != MBB) { + if (MBB) finishBlock(); + MBB = RegionBegin->getParent(); + startBlock(MBB); + if (Stage == 1) + computeBlockPressure(MBB); + } + + unsigned NumRegionInstrs = std::distance(begin(), end()); + enterRegion(MBB, begin(), end(), NumRegionInstrs); + + // Skip empty scheduling regions (0 or 1 schedulable instructions). + if (begin() == end() || begin() == std::prev(end())) { + exitRegion(); + continue; + } + + DEBUG(dbgs() << "********** MI Scheduling **********\n"); + DEBUG(dbgs() << MF.getName() + << ":BB#" << MBB->getNumber() << " " << MBB->getName() + << "\n From: " << *begin() << " To: "; + if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + + schedule(); + + exitRegion(); + ++RegionIdx; + } + finishBlock(); + + } while (Stage < 2); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 4cfc0ce..060d2ca 100644 --- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -14,17 +14,21 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H +#include "GCNRegPressure.h" #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { +class SIMachineFunctionInfo; class SIRegisterInfo; +class SISubtarget; /// This is a minimal scheduler strategy. 
The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+  friend class GCNScheduleDAGMILive;
   SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -35,18 +39,72 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
   void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
                      const RegPressureTracker &RPTracker,
                      const SIRegisterInfo *SRI,
-                     int SGPRPressure, int VGPRPressure,
-                     int SGPRExcessLimit, int VGPRExcessLimit,
-                     int SGPRCriticalLimit, int VGPRCriticalLimit);
+                     unsigned SGPRPressure, unsigned VGPRPressure);
-  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
-                    SchedBoundary *Zone, const SIRegisterInfo *SRI,
-                    unsigned SGPRPressure, unsigned VGPRPressure);
+  unsigned SGPRExcessLimit;
+  unsigned VGPRExcessLimit;
+  unsigned SGPRCriticalLimit;
+  unsigned VGPRCriticalLimit;
+
+  unsigned TargetOccupancy;
+
+  MachineFunction *MF;
 public:
   GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
   SUnit *pickNode(bool &IsTopNode) override;
+
+  void initialize(ScheduleDAGMI *DAG) override;
+
+  void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
+};
+
+class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+
+  const SISubtarget &ST;
+
+  const SIMachineFunctionInfo &MFI;
+
+  // Occupancy target at the beginning of the function scheduling cycle.
+  unsigned StartingOccupancy;
+
+  // Minimal real occupancy recorded for the function.
+  unsigned MinOccupancy;
+
+  // Scheduling stage number.
+  unsigned Stage;
+
+  // Current region index.
+  size_t RegionIdx;
+
+  // Vector of regions recorded for later rescheduling.
+  SmallVector<std::pair<MachineBasicBlock::iterator,
+                        MachineBasicBlock::iterator>, 32> Regions;
+
+  // Region live-in cache.
+  SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+  // Region pressure cache.
+  SmallVector<GCNRegPressure, 32> Pressure;
+
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
+
+  // Return current region pressure.
+  GCNRegPressure getRealRegPressure() const;
+
+  // Compute and cache live-ins and pressure for all regions in block.
+ void computeBlockPressure(const MachineBasicBlock *MBB); + + +public: + GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S); + + void schedule() override; + + void finalizeSchedule() override; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index 7172a0a..a844081 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstPrinter.h" -#include "SIDefines.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCExpr.h" @@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -113,11 +118,21 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { - O << " offset:"; + O << ((OpNo == 0)? "offset:" : " offset:"); printU16ImmDecOperand(MI, OpNo, O); } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? 
"offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { @@ -264,6 +297,11 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, case AMDGPU::FLAT_SCR_HI: O << "flat_scratch_hi"; return; + case AMDGPU::FP_REG: + case AMDGPU::SP_REG: + case AMDGPU::SCRATCH_WAVE_OFFSET_REG: + case AMDGPU::PRIVATE_RSRC_REG: + llvm_unreachable("pseudo-register should not ever be emitted"); default: break; } @@ -375,6 +413,13 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, O << formatHex(static_cast<uint64_t>(Imm)); } +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Lo16 = static_cast<uint16_t>(Imm); + printImmediate16(Lo16, STI, O); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -489,6 +534,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_FP16: printImmediate16(Op.getImm(), STI, O); break; + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + printImmediateV216(Op.getImm(), STI, O); + break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: O << formatDec(Op.getImm()); @@ -531,13 +580,34 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & SISrcMods::NEG) - O << '-'; + + // Use 'neg(...)' instead of '-' to avoid ambiguity. + // This is important for integer literals because + // -1 is not the same value as neg(1). + bool NegMnemo = false; + + if (InputModifiers & SISrcMods::NEG) { + if (OpNo + 1 < MI->getNumOperands() && + (InputModifiers & SISrcMods::ABS) == 0) { + const MCOperand &Op = MI->getOperand(OpNo + 1); + NegMnemo = Op.isImm() || Op.isFPImm(); + } + if (NegMnemo) { + O << "neg("; + } else { + O << '-'; + } + } + if (InputModifiers & SISrcMods::ABS) O << '|'; printOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::ABS) O << '|'; + + if (NegMnemo) { + O << ')'; + } } void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, @@ -672,11 +742,19 @@ template <unsigned N> void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en); + unsigned Opc = MI->getOpcode(); + int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en); unsigned En = MI->getOperand(EnIdx).getImm(); - // FIXME: What do we do with compr? The meaning of en changes depending on if - // compr is set. 
+ int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr); + + // If compr is set, print as src0, src0, src1, src1 + if (MI->getOperand(ComprIdx).getImm()) { + if (N == 1 || N == 2) + --OpNo; + else if (N == 3) + OpNo -= 2; + } if (En & (1 << N)) printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI); @@ -730,6 +808,71 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo, } } +static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) { + int DefaultValue = (Mod == SISrcMods::OP_SEL_1); + + for (int I = 0; I < NumOps; ++I) { + if (!!(Ops[I] & Mod) != DefaultValue) + return false; + } + + return true; +} + +static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, + raw_ostream &O) { + unsigned Opc = MI->getOpcode(); + int NumOps = 0; + int Ops[3]; + + for (int OpName : { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx == -1) + break; + + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + } + + if (allOpsDefaultValue(Ops, NumOps, Mod)) + return; + + O << Name; + for (int I = 0; I < NumOps; ++I) { + if (I != 0) + O << ','; + + O << !!(Ops[I] & Mod); + } + + O << ']'; +} + +void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O); +} + +void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O); +} + +void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O); +} + +void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1054,30 +1197,137 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, O << SImm16; // Unknown simm16 code. 
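// A hedged illustration of the packed-modifier syntax produced by
// printPackedModifier above: nothing is printed when every present source
// modifier carries the default bit; otherwise one 0/1 flag is printed per
// source, e.g. " op_sel:[0,1,0]" or " neg_lo:[1,0]". The helper below mirrors
// only the printing loop (the default-value check is omitted); its name is
// illustrative.
static void printPackedFlags(ArrayRef<int> Mods, unsigned ModBit,
                             StringRef Prefix, raw_ostream &O) {
  O << Prefix;                             // e.g. " op_sel:["
  for (size_t I = 0, E = Mods.size(); I != E; ++I)
    O << (I ? "," : "") << !!(Mods[I] & ModBit);
  O << ']';
}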
} +static void printSwizzleBitmask(const uint16_t AndMask, + const uint16_t OrMask, + const uint16_t XorMask, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask; + uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask; + + O << "\""; + + for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) { + uint16_t p0 = Probe0 & Mask; + uint16_t p1 = Probe1 & Mask; + + if (p0 == p1) { + if (p0 == 0) { + O << "0"; + } else { + O << "1"; + } + } else { + if (p0 == 0) { + O << "p"; + } else { + O << "i"; + } + } + } + + O << "\""; +} + +void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == 0) { + return; + } + + O << " offset:"; + + if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { + + O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; + for (auto i = 0; i < LANE_NUM; ++i) { + O << ","; + O << formatDec(Imm & LANE_MASK); + Imm >>= LANE_SHIFT; + } + O << ")"; + + } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) { + + uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK; + uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK; + uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK; + + if (AndMask == BITMASK_MAX && + OrMask == 0 && + countPopulation(XorMask) == 1) { + + O << "swizzle(" << IdSymbolic[ID_SWAP]; + O << ","; + O << formatDec(XorMask); + O << ")"; + + } else if (AndMask == BITMASK_MAX && + OrMask == 0 && XorMask > 0 && + isPowerOf2_64(XorMask + 1)) { + + O << "swizzle(" << IdSymbolic[ID_REVERSE]; + O << ","; + O << formatDec(XorMask + 1); + O << ")"; + + } else { + + uint16_t GroupSize = BITMASK_MAX - AndMask + 1; + if (GroupSize > 1 && + isPowerOf2_64(GroupSize) && + OrMask < GroupSize && + XorMask == 0) { + + O << "swizzle(" << IdSymbolic[ID_BROADCAST]; + O << ","; + O << formatDec(GroupSize); + O << ","; + O << formatDec(OrMask); + O << ")"; + + } else { + O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM]; + O << ","; + printSwizzleBitmask(AndMask, OrMask, XorMask, O); + O << ")"; + } + } + } else { + printU16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - IsaVersion IV = getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits()); unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt, Expcnt, Lgkmcnt; - decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt); + decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt); bool NeedSpace = false; - if (Vmcnt != getVmcntBitMask(IV)) { + if (Vmcnt != getVmcntBitMask(ISA)) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != getExpcntBitMask(IV)) { + if (Expcnt != getExpcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != getLgkmcntBitMask(IV)) { + if (Lgkmcnt != getLgkmcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index a6d348f..7bbf99a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ private: void printU4ImmDecOperand(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ private: void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,12 +88,18 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, @@ -117,6 +127,14 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSDWADstUnused(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, @@ -183,6 +201,8 @@ private: raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index ffb92aa..a50e3eb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" @@ -30,14 +30,9 @@ public: 
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsResolved) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { - MCValue Res; - - // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are - // used for long branches, this function will be called with - // IsResolved = false and Value set to some pre-computed value. In - // the example above, the value would be: - // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction. - // This is not what we want. We just want the expression computation - // only. The reason the MC layer subtracts the current offset from the - // expression is because the fixup is of kind FK_PCRel_4. - // For these scenarios, evaluateAsValue gives us the computation that we - // want. - if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) && - Res.isAbsolute()) { - Value = Res.getConstant(); - IsResolved = true; - - } - if (IsResolved) - Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { +void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsResolved) const { + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. @@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. @@ -164,7 +134,20 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( } bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - OW->WriteZeros(Count); + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + OW->WriteZeros(Count % 4); + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + + // FIXME: R600 support. 
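// A worked example of the padding scheme above, assuming the fixed 4-byte GCN
// instruction encoding: for Count = 10 the writer emits 10 % 4 = 2 zero bytes
// followed by 10 / 4 = 2 copies of the 32-bit s_nop 0 word defined below,
// i.e. 2 + 8 = 10 bytes in total.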
+ // s_nop 0 + const uint32_t Encoded_S_NOP_0 = 0xbf800000; + + for (uint64_t I = 0; I != Count; ++I) + OW->write32(Encoded_S_NOP_0); return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp new file mode 100644 index 0000000..4e828a7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -0,0 +1,432 @@ +//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Code Object Metadata Streamer. +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUCodeObjectMetadataStreamer.h" +#include "AMDGPU.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +static cl::opt<bool> DumpCodeObjectMetadata( + "amdgpu-dump-comd", + cl::desc("Dump AMDGPU Code Object Metadata")); +static cl::opt<bool> VerifyCodeObjectMetadata( + "amdgpu-verify-comd", + cl::desc("Verify AMDGPU Code Object Metadata")); + +namespace AMDGPU { +namespace CodeObject { + +void MetadataStreamer::dump(StringRef YamlString) const { + errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n'; +} + +void MetadataStreamer::verify(StringRef YamlString) const { + errs() << "AMDGPU Code Object Metadata Parser Test: "; + + CodeObject::Metadata FromYamlString; + if (Metadata::fromYamlString(YamlString, FromYamlString)) { + errs() << "FAIL\n"; + return; + } + + std::string ToYamlString; + if (Metadata::toYamlString(FromYamlString, ToYamlString)) { + errs() << "FAIL\n"; + return; + } + + errs() << (YamlString == ToYamlString ? 
"PASS" : "FAIL") << '\n'; + if (YamlString != ToYamlString) { + errs() << "Original input: " << YamlString << '\n' + << "Produced output: " << ToYamlString << '\n'; + } +} + +AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { + if (AccQual.empty()) + return AccessQualifier::Unknown; + + return StringSwitch<AccessQualifier>(AccQual) + .Case("read_only", AccessQualifier::ReadOnly) + .Case("write_only", AccessQualifier::WriteOnly) + .Case("read_write", AccessQualifier::ReadWrite) + .Default(AccessQualifier::Default); +} + +AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer( + unsigned AddressSpace) const { + if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS) + return AddressSpaceQualifier::Private; + if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS) + return AddressSpaceQualifier::Global; + if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS) + return AddressSpaceQualifier::Constant; + if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS) + return AddressSpaceQualifier::Local; + if (AddressSpace == AMDGPUASI.FLAT_ADDRESS) + return AddressSpaceQualifier::Generic; + if (AddressSpace == AMDGPUASI.REGION_ADDRESS) + return AddressSpaceQualifier::Region; + + llvm_unreachable("Unknown address space qualifier"); +} + +ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { + if (TypeQual.find("pipe") != StringRef::npos) + return ValueKind::Pipe; + + return StringSwitch<ValueKind>(BaseTypeName) + .Case("image1d_t", ValueKind::Image) + .Case("image1d_array_t", ValueKind::Image) + .Case("image1d_buffer_t", ValueKind::Image) + .Case("image2d_t", ValueKind::Image) + .Case("image2d_array_t", ValueKind::Image) + .Case("image2d_array_depth_t", ValueKind::Image) + .Case("image2d_array_msaa_t", ValueKind::Image) + .Case("image2d_array_msaa_depth_t", ValueKind::Image) + .Case("image2d_depth_t", ValueKind::Image) + .Case("image2d_msaa_t", ValueKind::Image) + .Case("image2d_msaa_depth_t", ValueKind::Image) + .Case("image3d_t", ValueKind::Image) + .Case("sampler_t", ValueKind::Sampler) + .Case("queue_t", ValueKind::Queue) + .Default(isa<PointerType>(Ty) ? + (Ty->getPointerAddressSpace() == + AMDGPUASI.LOCAL_ADDRESS ? + ValueKind::DynamicSharedPointer : + ValueKind::GlobalBuffer) : + ValueKind::ByValue); +} + +ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + auto Signed = !TypeName.startswith("u"); + switch (Ty->getIntegerBitWidth()) { + case 8: + return Signed ? ValueType::I8 : ValueType::U8; + case 16: + return Signed ? ValueType::I16 : ValueType::U16; + case 32: + return Signed ? ValueType::I32 : ValueType::U32; + case 64: + return Signed ? 
ValueType::I64 : ValueType::U64; + default: + return ValueType::Struct; + } + } + case Type::HalfTyID: + return ValueType::F16; + case Type::FloatTyID: + return ValueType::F32; + case Type::DoubleTyID: + return ValueType::F64; + case Type::PointerTyID: + return getValueType(Ty->getPointerElementType(), TypeName); + case Type::VectorTyID: + return getValueType(Ty->getVectorElementType(), TypeName); + default: + return ValueType::Struct; + } +} + +std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + if (!Signed) + return (Twine('u') + getTypeName(Ty, true)).str(); + + auto BitWidth = Ty->getIntegerBitWidth(); + switch (BitWidth) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BitWidth)).str(); + } + } + case Type::HalfTyID: + return "half"; + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + return "double"; + case Type::VectorTyID: { + auto VecTy = cast<VectorType>(Ty); + auto ElTy = VecTy->getElementType(); + auto NumElements = VecTy->getVectorNumElements(); + return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); + } + default: + return "unknown"; + } +} + +std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( + MDNode *Node) const { + std::vector<uint32_t> Dims; + if (Node->getNumOperands() != 3) + return Dims; + + for (auto &Op : Node->operands()) + Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue()); + return Dims; +} + +void MetadataStreamer::emitVersion() { + auto &Version = CodeObjectMetadata.mVersion; + + Version.push_back(MetadataVersionMajor); + Version.push_back(MetadataVersionMinor); +} + +void MetadataStreamer::emitPrintf(const Module &Mod) { + auto &Printf = CodeObjectMetadata.mPrintf; + + auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); + if (!Node) + return; + + for (auto Op : Node->operands()) + if (Op->getNumOperands()) + Printf.push_back(cast<MDString>(Op->getOperand(0))->getString()); +} + +void MetadataStreamer::emitKernelLanguage(const Function &Func) { + auto &Kernel = CodeObjectMetadata.mKernels.back(); + + // TODO: What about other languages? + auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); + if (!Node || !Node->getNumOperands()) + return; + auto Op0 = Node->getOperand(0); + if (Op0->getNumOperands() <= 1) + return; + + Kernel.mLanguage = "OpenCL C"; + Kernel.mLanguageVersion.push_back( + mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()); + Kernel.mLanguageVersion.push_back( + mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()); +} + +void MetadataStreamer::emitKernelAttrs(const Function &Func) { + auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs; + + if (auto Node = Func.getMetadata("reqd_work_group_size")) + Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("work_group_size_hint")) + Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("vec_type_hint")) { + Attrs.mVecTypeHint = getTypeName( + cast<ValueAsMetadata>(Node->getOperand(0))->getType(), + mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()); + } +} + +void MetadataStreamer::emitKernelArgs(const Function &Func) { + for (auto &Arg : Func.args()) + emitKernelArg(Arg); + + // TODO: What about other languages? 
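// A minimal sketch, mirroring the code below, of the hidden arguments appended
// for OpenCL kernels; HiddenArgDesc and kHiddenArgs are illustrative names,
// and the 8-byte pointer size assumes 64-bit global pointers.
struct HiddenArgDesc { const char *Kind; unsigned SizeInBytes; };
static const HiddenArgDesc kHiddenArgs[] = {
  {"HiddenGlobalOffsetX", 8},  // i64, appended whenever OpenCL metadata is present
  {"HiddenGlobalOffsetY", 8},  // i64
  {"HiddenGlobalOffsetZ", 8},  // i64
  {"HiddenPrintfBuffer",  8},  // global i8*, only when llvm.printf.fmts exists
};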
+ if (!Func.getParent()->getNamedMetadata("opencl.ocl.version")) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + + if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + return; + + auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), + AMDGPUASI.GLOBAL_ADDRESS); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); +} + +void MetadataStreamer::emitKernelArg(const Argument &Arg) { + auto Func = Arg.getParent(); + auto ArgNo = Arg.getArgNo(); + const MDNode *Node; + + StringRef TypeQual; + Node = Func->getMetadata("kernel_arg_type_qual"); + if (Node && ArgNo < Node->getNumOperands()) + TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef BaseTypeName; + Node = Func->getMetadata("kernel_arg_base_type"); + if (Node && ArgNo < Node->getNumOperands()) + BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef AccQual; + if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && + Arg.hasNoAliasAttr()) { + AccQual = "read_only"; + } else { + Node = Func->getMetadata("kernel_arg_access_qual"); + if (Node && ArgNo < Node->getNumOperands()) + AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + } + + StringRef Name; + Node = Func->getMetadata("kernel_arg_name"); + if (Node && ArgNo < Node->getNumOperands()) + Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef TypeName; + Node = Func->getMetadata("kernel_arg_type"); + if (Node && ArgNo < Node->getNumOperands()) + TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), + getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual, + BaseTypeName, AccQual, Name, TypeName); +} + +void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, + ValueKind ValueKind, StringRef TypeQual, + StringRef BaseTypeName, StringRef AccQual, + StringRef Name, StringRef TypeName) { + CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); + auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back(); + + Arg.mSize = DL.getTypeAllocSize(Ty); + Arg.mAlign = DL.getABITypeAlignment(Ty); + Arg.mValueKind = ValueKind; + Arg.mValueType = getValueType(Ty, BaseTypeName); + + if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + auto ElTy = PtrTy->getElementType(); + if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized()) + Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy); + } + + Arg.mAccQual = getAccessQualifier(AccQual); + + if (auto PtrTy = dyn_cast<PointerType>(Ty)) + Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); + + SmallVector<StringRef, 1> SplitTypeQuals; + TypeQual.split(SplitTypeQuals, " ", -1, false); + for (StringRef Key : SplitTypeQuals) { + auto P = StringSwitch<bool*>(Key) + .Case("const", &Arg.mIsConst) + .Case("pipe", &Arg.mIsPipe) + .Case("restrict", &Arg.mIsRestrict) + .Case("volatile", &Arg.mIsVolatile) + .Default(nullptr); + if (P) + *P = true; + } + + Arg.mName = Name; + Arg.mTypeName = TypeName; +} + +void MetadataStreamer::emitKernelCodeProps( + const amd_kernel_code_t &KernelCode) { + auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps; + + CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size; + 
CodeProps.mWorkgroupGroupSegmentSize = + KernelCode.workgroup_group_segment_byte_size; + CodeProps.mWorkitemPrivateSegmentSize = + KernelCode.workitem_private_segment_byte_size; + CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count; + CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count; + CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment; + CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment; + CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment; + CodeProps.mWavefrontSize = KernelCode.wavefront_size; +} + +void MetadataStreamer::emitKernelDebugProps( + const amd_kernel_code_t &KernelCode) { + if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED)) + return; + + auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps; + + // FIXME: Need to pass down debugger ABI version through features. This is ok + // for now because we only have one version. + DebugProps.mDebuggerABIVersion.push_back(1); + DebugProps.mDebuggerABIVersion.push_back(0); + DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count; + DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first; + DebugProps.mPrivateSegmentBufferSGPR = + KernelCode.debug_private_segment_buffer_sgpr; + DebugProps.mWavefrontPrivateSegmentOffsetSGPR = + KernelCode.debug_wavefront_private_segment_offset_sgpr; +} + +void MetadataStreamer::begin(const Module &Mod) { + AMDGPUASI = getAMDGPUAS(Mod); + emitVersion(); + emitPrintf(Mod); +} + +void MetadataStreamer::emitKernel(const Function &Func, + const amd_kernel_code_t &KernelCode) { + if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) + return; + + CodeObjectMetadata.mKernels.push_back(Kernel::Metadata()); + auto &Kernel = CodeObjectMetadata.mKernels.back(); + + Kernel.mName = Func.getName(); + emitKernelLanguage(Func); + emitKernelAttrs(Func); + emitKernelArgs(Func); + emitKernelCodeProps(KernelCode); + emitKernelDebugProps(KernelCode); +} + +ErrorOr<std::string> MetadataStreamer::toYamlString() { + std::string YamlString; + if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString)) + return Error; + + if (DumpCodeObjectMetadata) + dump(YamlString); + if (VerifyCodeObjectMetadata) + verify(YamlString); + + return YamlString; +} + +ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) { + if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata)) + return Error; + + return toYamlString(); +} + +} // end namespace CodeObject +} // end namespace AMDGPU +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h new file mode 100644 index 0000000..c668143 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h @@ -0,0 +1,99 @@ +//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Code Object Metadata Streamer. 
+/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H + +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/AMDGPUCodeObjectMetadata.h" +#include "llvm/Support/ErrorOr.h" + +namespace llvm { + +class Argument; +class DataLayout; +class Function; +class MDNode; +class Module; +class Type; + +namespace AMDGPU { +namespace CodeObject { + +class MetadataStreamer final { +private: + Metadata CodeObjectMetadata; + AMDGPUAS AMDGPUASI; + + void dump(StringRef YamlString) const; + + void verify(StringRef YamlString) const; + + AccessQualifier getAccessQualifier(StringRef AccQual) const; + + AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const; + + ValueKind getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const; + + ValueType getValueType(Type *Ty, StringRef TypeName) const; + + std::string getTypeName(Type *Ty, bool Signed) const; + + std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; + + void emitVersion(); + + void emitPrintf(const Module &Mod); + + void emitKernelLanguage(const Function &Func); + + void emitKernelAttrs(const Function &Func); + + void emitKernelArgs(const Function &Func); + + void emitKernelArg(const Argument &Arg); + + void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, + StringRef TypeQual = "", StringRef BaseTypeName = "", + StringRef AccQual = "", StringRef Name = "", + StringRef TypeName = ""); + + void emitKernelCodeProps(const amd_kernel_code_t &KernelCode); + + void emitKernelDebugProps(const amd_kernel_code_t &KernelCode); + +public: + MetadataStreamer() = default; + ~MetadataStreamer() = default; + + void begin(const Module &Mod); + + void end() {} + + void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode); + + ErrorOr<std::string> toYamlString(); + + ErrorOr<std::string> toYamlString(StringRef YamlString); +}; + +} // end namespace CodeObject +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 1847d7a..6abe7f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -1,16 +1,20 @@ -//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// -/// \file //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -19,20 +23,21 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend); + protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} // End anonymous namespace +} // end anonymous namespace AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend) : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA, ELF::EM_AMDGPU, - HasRelocationAddend) { } + HasRelocationAddend) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -77,7 +82,6 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("unhandled relocation type"); } - MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend, raw_pwrite_stream &OS) { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 1655591..2364e7b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -14,6 +14,8 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4; + StackGrowsUp = true; HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 3d3858a..1b06206 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,6 +52,18 @@ public: return 0; } + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; void verifyInstructionPredicates(const MCInst &MI, diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 548bad5..f80b5f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -54,11 +54,17 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_OPERAND_ENUM +#undef GET_INSTRINFO_ENUM + #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM #endif diff --git 
a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp deleted file mode 100644 index 95387ad..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp +++ /dev/null @@ -1,408 +0,0 @@ -//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Generates AMDGPU runtime metadata for YAML mapping. -// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPURuntimeMetadata.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/YAMLTraits.h" -#include <vector> -#include "AMDGPURuntimeMD.h" - -using namespace llvm; -using namespace ::AMDGPU::RuntimeMD; - -static cl::opt<bool> -DumpRuntimeMD("amdgpu-dump-rtmd", - cl::desc("Dump AMDGPU runtime metadata")); - -static cl::opt<bool> -CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden, - cl::desc("Check AMDGPU runtime metadata YAML parser")); - -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) -LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) -LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata) - -namespace llvm { -namespace yaml { - -template <> struct MappingTraits<KernelArg::Metadata> { - static void mapping(IO &YamlIO, KernelArg::Metadata &A) { - YamlIO.mapRequired(KeyName::ArgSize, A.Size); - YamlIO.mapRequired(KeyName::ArgAlign, A.Align); - YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U); - YamlIO.mapRequired(KeyName::ArgKind, A.Kind); - YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType); - YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string()); - YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string()); - YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL); - YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL); - YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0)); - } - static const bool flow = true; -}; - -template <> struct MappingTraits<Kernel::Metadata> { - static void mapping(IO &YamlIO, Kernel::Metadata &K) { - YamlIO.mapRequired(KeyName::KernelName, K.Name); - YamlIO.mapOptional(KeyName::Language, K.Language, std::string()); - YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion); - YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize); - YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint); - YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string()); - YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex, - INVALID_KERNEL_INDEX); - YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups, - uint8_t(0)); - YamlIO.mapRequired(KeyName::Args, K.Args); - } - static const bool flow = true; -}; - -template <> struct 
MappingTraits<Program::Metadata> { - static void mapping(IO &YamlIO, Program::Metadata &Prog) { - YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq); - YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo); - YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels); - } - static const bool flow = true; -}; - -} // end namespace yaml -} // end namespace llvm - -// Get a vector of three integer values from MDNode \p Node; -static std::vector<uint32_t> getThreeInt32(MDNode *Node) { - assert(Node->getNumOperands() == 3); - std::vector<uint32_t> V; - for (const MDOperand &Op : Node->operands()) { - const ConstantInt *CI = mdconst::extract<ConstantInt>(Op); - V.push_back(CI->getZExtValue()); - } - return V; -} - -static std::string getOCLTypeName(Type *Ty, bool Signed) { - switch (Ty->getTypeID()) { - case Type::HalfTyID: - return "half"; - case Type::FloatTyID: - return "float"; - case Type::DoubleTyID: - return "double"; - case Type::IntegerTyID: { - if (!Signed) - return (Twine('u') + getOCLTypeName(Ty, true)).str(); - unsigned BW = Ty->getIntegerBitWidth(); - switch (BW) { - case 8: - return "char"; - case 16: - return "short"; - case 32: - return "int"; - case 64: - return "long"; - default: - return (Twine('i') + Twine(BW)).str(); - } - } - case Type::VectorTyID: { - VectorType *VecTy = cast<VectorType>(Ty); - Type *EleTy = VecTy->getElementType(); - unsigned Size = VecTy->getVectorNumElements(); - return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str(); - } - default: - return "unknown"; - } -} - -static KernelArg::ValueType getRuntimeMDValueType( - Type *Ty, StringRef TypeName) { - switch (Ty->getTypeID()) { - case Type::HalfTyID: - return KernelArg::F16; - case Type::FloatTyID: - return KernelArg::F32; - case Type::DoubleTyID: - return KernelArg::F64; - case Type::IntegerTyID: { - bool Signed = !TypeName.startswith("u"); - switch (Ty->getIntegerBitWidth()) { - case 8: - return Signed ? KernelArg::I8 : KernelArg::U8; - case 16: - return Signed ? KernelArg::I16 : KernelArg::U16; - case 32: - return Signed ? KernelArg::I32 : KernelArg::U32; - case 64: - return Signed ? KernelArg::I64 : KernelArg::U64; - default: - // Runtime does not recognize other integer types. Report as struct type. - return KernelArg::Struct; - } - } - case Type::VectorTyID: - return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName); - case Type::PointerTyID: - return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName); - default: - return KernelArg::Struct; - } -} - -static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace( - AMDGPUAS::AddressSpaces A) { - switch (A) { - case AMDGPUAS::GLOBAL_ADDRESS: - return KernelArg::Global; - case AMDGPUAS::CONSTANT_ADDRESS: - return KernelArg::Constant; - case AMDGPUAS::LOCAL_ADDRESS: - return KernelArg::Local; - case AMDGPUAS::FLAT_ADDRESS: - return KernelArg::Generic; - case AMDGPUAS::REGION_ADDRESS: - return KernelArg::Region; - default: - return KernelArg::Private; - } -} - -static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL, - Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "", - StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "", - StringRef AccQual = "") { - - KernelArg::Metadata Arg; - - // Set ArgSize and ArgAlign. 
- Arg.Size = DL.getTypeAllocSize(T); - Arg.Align = DL.getABITypeAlignment(T); - if (auto PT = dyn_cast<PointerType>(T)) { - auto ET = PT->getElementType(); - if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized()) - Arg.PointeeAlign = DL.getABITypeAlignment(ET); - } - - // Set ArgTypeName. - Arg.TypeName = TypeName; - - // Set ArgName. - Arg.Name = ArgName; - - // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe. - SmallVector<StringRef, 1> SplitQ; - TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */); - - for (StringRef KeyName : SplitQ) { - auto *P = StringSwitch<uint8_t *>(KeyName) - .Case("volatile", &Arg.IsVolatile) - .Case("restrict", &Arg.IsRestrict) - .Case("const", &Arg.IsConst) - .Case("pipe", &Arg.IsPipe) - .Default(nullptr); - if (P) - *P = 1; - } - - // Set ArgKind. - Arg.Kind = Kind; - - // Set ArgValueType. - Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName); - - // Set ArgAccQual. - if (!AccQual.empty()) { - Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual) - .Case("read_only", KernelArg::ReadOnly) - .Case("write_only", KernelArg::WriteOnly) - .Case("read_write", KernelArg::ReadWrite) - .Default(KernelArg::AccNone); - } - - // Set ArgAddrQual. - if (auto *PT = dyn_cast<PointerType>(T)) { - Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>( - PT->getAddressSpace())); - } - - return Arg; -} - -static Kernel::Metadata getRuntimeMDForKernel(const Function &F) { - Kernel::Metadata Kernel; - Kernel.Name = F.getName(); - auto &M = *F.getParent(); - - // Set Language and LanguageVersion. - if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - if (MD->getNumOperands() != 0) { - auto Node = MD->getOperand(0); - if (Node->getNumOperands() > 1) { - Kernel.Language = "OpenCL C"; - uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) - ->getZExtValue(); - uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) - ->getZExtValue(); - Kernel.LanguageVersion.push_back(Major); - Kernel.LanguageVersion.push_back(Minor); - } - } - } - - const DataLayout &DL = F.getParent()->getDataLayout(); - for (auto &Arg : F.args()) { - unsigned I = Arg.getArgNo(); - Type *T = Arg.getType(); - auto TypeName = dyn_cast<MDString>(F.getMetadata( - "kernel_arg_type")->getOperand(I))->getString(); - auto BaseTypeName = cast<MDString>(F.getMetadata( - "kernel_arg_base_type")->getOperand(I))->getString(); - StringRef ArgName; - if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) - ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString(); - auto TypeQual = cast<MDString>(F.getMetadata( - "kernel_arg_type_qual")->getOperand(I))->getString(); - auto AccQual = cast<MDString>(F.getMetadata( - "kernel_arg_access_qual")->getOperand(I))->getString(); - KernelArg::Kind Kind; - if (TypeQual.find("pipe") != StringRef::npos) - Kind = KernelArg::Pipe; - else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName) - .Case("sampler_t", KernelArg::Sampler) - .Case("queue_t", KernelArg::Queue) - .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", - "image2d_t" , "image2d_array_t", KernelArg::Image) - .Cases("image2d_depth_t", "image2d_array_depth_t", - "image2d_msaa_t", "image2d_array_msaa_t", - "image2d_msaa_depth_t", KernelArg::Image) - .Cases("image2d_array_msaa_depth_t", "image3d_t", - KernelArg::Image) - .Default(isa<PointerType>(T) ? - (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ? 
- KernelArg::DynamicSharedPointer : - KernelArg::GlobalBuffer) : - KernelArg::ByValue); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind, - BaseTypeName, TypeName, ArgName, TypeQual, AccQual)); - } - - // Emit hidden kernel arguments for OpenCL kernels. - if (F.getParent()->getNamedMetadata("opencl.ocl.version")) { - auto Int64T = Type::getInt64Ty(F.getContext()); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetX)); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetY)); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetZ)); - if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) { - auto Int8PtrT = Type::getInt8PtrTy(F.getContext(), - KernelArg::Global); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT, - KernelArg::HiddenPrintfBuffer)); - } - } - - // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint. - if (auto RWGS = F.getMetadata("reqd_work_group_size")) - Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS); - - if (auto WGSH = F.getMetadata("work_group_size_hint")) - Kernel.WorkGroupSizeHint = getThreeInt32(WGSH); - - if (auto VTH = F.getMetadata("vec_type_hint")) - Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>( - VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>( - VTH->getOperand(1))->getZExtValue()); - - return Kernel; -} - -Program::Metadata::Metadata(const std::string &YAML) { - yaml::Input Input(YAML); - Input >> *this; -} - -std::string Program::Metadata::toYAML(void) { - std::string Text; - raw_string_ostream Stream(Text); - yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */); - Output << *this; - return Stream.str(); -} - -Program::Metadata Program::Metadata::fromYAML(const std::string &S) { - return Program::Metadata(S); -} - -// Check if the YAML string can be parsed. -static void checkRuntimeMDYAMLString(const std::string &YAML) { - auto P = Program::Metadata::fromYAML(YAML); - auto S = P.toYAML(); - llvm::errs() << "AMDGPU runtime metadata parser test " - << (YAML == S ? "passes" : "fails") << ".\n"; - if (YAML != S) { - llvm::errs() << "First output: " << YAML << '\n' - << "Second output: " << S << '\n'; - } -} - -std::string llvm::getRuntimeMDYAMLString(Module &M) { - Program::Metadata Prog; - Prog.MDVersionSeq.push_back(MDVersion); - Prog.MDVersionSeq.push_back(MDRevision); - - // Set PrintfInfo. - if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) { - for (unsigned I = 0; I < MD->getNumOperands(); ++I) { - auto Node = MD->getOperand(I); - if (Node->getNumOperands() > 0) - Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0)) - ->getString()); - } - } - - // Set Kernels. 
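The ReqdWorkGroupSize / WorkGroupSizeHint handling above reduces to pulling three ConstantInt operands out of a function-level MDNode (the in-tree getThreeInt32 additionally asserts there are exactly three operands). A standalone sketch with illustrative names:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
#include <cstdint>
#include <vector>

// Illustrative helper: read !reqd_work_group_size-style metadata, i.e. a
// function-level MDNode holding i32 constants, and return their values.
static std::vector<uint32_t> readWorkGroupMD(const llvm::Function &F,
                                             llvm::StringRef MDName) {
  std::vector<uint32_t> V;
  if (llvm::MDNode *Node = F.getMetadata(MDName)) {
    for (const llvm::MDOperand &Op : Node->operands())
      V.push_back(
          llvm::mdconst::extract<llvm::ConstantInt>(Op)->getZExtValue());
  }
  return V;
}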
- for (auto &F: M.functions()) { - if (!F.getMetadata("kernel_arg_type")) - continue; - Prog.Kernels.emplace_back(getRuntimeMDForKernel(F)); - } - - auto YAML = Prog.toYAML(); - - if (DumpRuntimeMD) - llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n'; - - if (CheckRuntimeMDParser) - checkRuntimeMDYAMLString(YAML); - - return YAML; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h deleted file mode 100644 index a92fdd4..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h +++ /dev/null @@ -1,26 +0,0 @@ -//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares functions for generating runtime metadata. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H -#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H - -#include <string> - -namespace llvm { -class Module; - -// Get runtime metadata as YAML string. -std::string getRuntimeMDYAMLString(Module &M); - -} -#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 3392183..2a0032f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" #include "AMDGPUTargetStreamer.h" +#include "AMDGPU.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -25,9 +26,7 @@ #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/FormattedStream.h" -#include "AMDGPURuntimeMD.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -36,9 +35,27 @@ namespace llvm { using namespace llvm; using namespace llvm::AMDGPU; +//===----------------------------------------------------------------------===// +// AMDGPUTargetStreamer +//===----------------------------------------------------------------------===// + AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) { + CodeObjectMetadataStreamer.begin(Mod); +} + +void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata( + const Function &Func, const amd_kernel_code_t &KernelCode) { + CodeObjectMetadataStreamer.emitKernel(Func, KernelCode); +} + +void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() { + CodeObjectMetadataStreamer.end(); + EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get()); +} + //===----------------------------------------------------------------------===// // AMDGPUTargetAsmStreamer //===----------------------------------------------------------------------===// @@ -83,26 +100,16 @@ void 
AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } -void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( - StringRef GlobalName) { - OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; -} +bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) { + auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString); + if (!VerifiedYamlString) + return false; -void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( - StringRef GlobalName) { - OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; -} + OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n'; + OS << VerifiedYamlString.get(); + OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n'; -void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) { - OS << "\t.amdgpu_runtime_metadata\n"; - OS << getRuntimeMDYAMLString(M); - OS << "\n\t.end_amdgpu_runtime_metadata\n"; -} - -void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) { - OS << "\t.amdgpu_runtime_metadata"; - OS << Metadata; - OS << "\t.end_amdgpu_runtime_metadata\n"; + return true; } //===----------------------------------------------------------------------===// @@ -116,22 +123,21 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } -void -AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ, - PT_NOTE::NoteType Type, - std::function<void(MCELFStreamer &)> EmitDesc) { +void AMDGPUTargetELFStreamer::EmitAMDGPUNote( + const MCExpr *DescSZ, ElfNote::NoteType Type, + function_ref<void(MCELFStreamer &)> EmitDesc) { auto &S = getStreamer(); auto &Context = S.getContext(); - auto NameSZ = sizeof(PT_NOTE::NoteName); + auto NameSZ = sizeof(ElfNote::NoteName); S.PushSection(); S.SwitchSection(Context.getELFSection( - PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); + ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); S.EmitIntValue(NameSZ, 4); // namesz S.EmitValue(DescSZ, 4); // descz - S.EmitIntValue(Type, 4); // type - S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name + S.EmitIntValue(Type, 4); // type + S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 @@ -144,7 +150,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, EmitAMDGPUNote( MCConstantExpr::create(8, getContext()), - PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS){ OS.EmitIntValue(Major, 4); OS.EmitIntValue(Minor, 4); @@ -160,14 +166,14 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, StringRef ArchName) { uint16_t VendorNameSize = VendorName.size() + 1; uint16_t ArchNameSize = ArchName.size() + 1; - + unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + VendorNameSize + ArchNameSize; EmitAMDGPUNote( MCConstantExpr::create(DescSZ, getContext()), - PT_NOTE::NT_AMDGPU_HSA_ISA, + ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { OS.EmitIntValue(VendorNameSize, 2); OS.EmitIntValue(ArchNameSize, 2); @@ -198,25 +204,11 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); } -void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( - StringRef GlobalName) { - - MCSymbolELF *Symbol = cast<MCSymbolELF>( - 
getStreamer().getContext().getOrCreateSymbol(GlobalName)); - Symbol->setType(ELF::STT_OBJECT); - Symbol->setBinding(ELF::STB_LOCAL); -} - -void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( - StringRef GlobalName) { - - MCSymbolELF *Symbol = cast<MCSymbolELF>( - getStreamer().getContext().getOrCreateSymbol(GlobalName)); - Symbol->setType(ELF::STT_OBJECT); - Symbol->setBinding(ELF::STB_GLOBAL); -} +bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) { + auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString); + if (!VerifiedYamlString) + return false; -void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) { // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. auto &Context = getContext(); @@ -228,15 +220,13 @@ void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) { EmitAMDGPUNote( DescSZ, - PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA, + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA, [&](MCELFStreamer &OS) { OS.EmitLabel(DescBegin); - OS.EmitBytes(Metadata); + OS.EmitBytes(VerifiedYamlString.get()); OS.EmitLabel(DescEnd); } ); -} -void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) { - EmitRuntimeMetadata(getRuntimeMDYAMLString(M)); + return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index e2f2058..968128e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -10,6 +10,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#include "AMDGPUCodeObjectMetadataStreamer.h" #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" @@ -26,6 +27,7 @@ class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { protected: + AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer; MCContext &getContext() const { return Streamer.getContext(); } public: @@ -42,16 +44,18 @@ public: virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; - virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + virtual void EmitStartOfCodeObjectMetadata(const Module &Mod); - virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; + virtual void EmitKernelCodeObjectMetadata( + const Function &Func, const amd_kernel_code_t &KernelCode); - virtual void EmitRuntimeMetadata(Module &M) = 0; + virtual void EmitEndOfCodeObjectMetadata(); - virtual void EmitRuntimeMetadata(StringRef Metadata) = 0; + /// \returns True on success, false on failure. + virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0; }; -class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { +class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { formatted_raw_ostream &OS; public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); @@ -66,21 +70,16 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; - void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; - - void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - - void EmitRuntimeMetadata(Module &M) override; - - void EmitRuntimeMetadata(StringRef Metadata) override; + /// \returns True on success, false on failure. 
+ bool EmitCodeObjectMetadata(StringRef YamlString) override; }; -class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { +class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; - void EmitAMDGPUNote(const MCExpr* DescSize, - AMDGPU::PT_NOTE::NoteType Type, - std::function<void(MCELFStreamer &)> EmitDesc); + void EmitAMDGPUNote(const MCExpr *DescSize, + AMDGPU::ElfNote::NoteType Type, + function_ref<void(MCELFStreamer &)> EmitDesc); public: AMDGPUTargetELFStreamer(MCStreamer &S); @@ -98,13 +97,8 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; - void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; - - void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - - void EmitRuntimeMetadata(Module &M) override; - - void EmitRuntimeMetadata(StringRef Metadata) override; + /// \returns True on success, false on failure. + bool EmitCodeObjectMetadata(StringRef YamlString) override; }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 6015ec1..eab90e1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -14,10 +14,10 @@ // //===----------------------------------------------------------------------===// -#include "R600Defines.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "R600Defines.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 0c5bb06..376c9bf 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,6 +69,14 @@ public: unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -220,13 +228,33 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, Imm = MO.getImm(); } - switch (AMDGPU::getOperandSize(OpInfo)) { - case 4: + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - case 8: + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: return getLit64Encoding(static_cast<uint64_t>(Imm), STI); - case 2: + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + // FIXME Is this correct? What do inline immediates do on SI for f16 src + // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + uint16_t Lo16 = static_cast<uint16_t>(Imm); + uint32_t Encoding = getLit16Encoding(Lo16, STI); + return Encoding; + } default: llvm_unreachable("invalid operand size"); } @@ -297,6 +325,63 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned +SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; +} + +unsigned +SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + return RegEnc; +} + +static bool needsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + case MCExpr::SymbolRef: + return true; + case MCExpr::Binary: { + auto *BE = cast<MCBinaryExpr>(Expr); + if (BE->getOpcode() == MCBinaryExpr::Sub) + return false; + return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); + } + case MCExpr::Unary: + return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); + case MCExpr::Target: + case MCExpr::Constant: + return false; + } + llvm_unreachable("invalid kind"); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, @@ -305,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + // FIXME: If this is expression is PCRel or not should not depend on what + // the expression looks like. 
Given that this is just a general expression, + // it should probably be FK_Data_4 and whatever is producing + // + // s_add_u32 s2, s2, (extern_const_addrspace+16 + // + // And expecting a PCRel should instead produce + // + // .Ltmp1: + // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 MCFixupKind Kind; - if (Expr && Expr->getSymbol().isExternal()) - Kind = FK_Data_4; - else + if (needsPCRel(MO.getExpr())) Kind = FK_PCRel_4; + else + Kind = FK_Data_4; Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 46803e5..06e2c11 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -26,6 +26,7 @@ class MIMG_Helper <dag outs, dag ins, string asm, let isAsmParserOnly = !if(!eq(dns,""), 1, 0); let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; + let SchedRW = [WriteVMEM]; } class MIMG_NoSampler_Helper <bits<7> op, string asm, @@ -475,106 +476,6 @@ class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < sub0) >; -// ======= SI Image Intrinsics ================ - -// Image load -defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; -defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; -def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; - -// Basic sample -defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; -defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; -defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; -defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; - -// Sample with comparison -defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; -defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; - -// Sample with offsets -defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; -defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; -defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; -defm : 
SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; - -// Sample with comparison and offsets -defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; -def : 
SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; - -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; - // ======= amdgcn Image Intrinsics ============== // Image load diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index 3c07cc7..d30d1d3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -80,50 +80,53 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] +def : ProcessorModel<"gfx600", SIFullSpeedModel, + [FeatureISAVersion6_0_0]>; + +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureISAVersion6_0_0] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureISAVersion6_0_0] >; -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] +def : ProcessorModel<"gfx601", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] >; -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; -def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"verde", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; -def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"oland", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; -def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; //===----------------------------------------------------------------------===// // Sea Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"bonaire", SIQuarterSpeedModel, +def : ProcessorModel<"gfx700", SIQuarterSpeedModel, [FeatureISAVersion7_0_0] >; -def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureISAVersion7_0_2] +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] >; def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureISAVersion7_0_0] >; -def : ProcessorModel<"hawaii", SIFullSpeedModel, +def : ProcessorModel<"gfx701", SIFullSpeedModel, [FeatureISAVersion7_0_1] >; -def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureISAVersion7_0_2]>; - -def : ProcessorModel<"gfx700", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] ->; - -def : ProcessorModel<"gfx701", SIFullSpeedModel, +def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureISAVersion7_0_1] >; @@ -131,6 +134,17 @@ def : ProcessorModel<"gfx702", SIQuarterSpeedModel, [FeatureISAVersion7_0_2] >; +def : ProcessorModel<"gfx703", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3]>; + //===----------------------------------------------------------------------===// // Volcanic Islands 
//===----------------------------------------------------------------------===// @@ -187,3 +201,23 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel, [FeatureISAVersion8_1_0] >; +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx900", SIQuarterSpeedModel, + [FeatureISAVersion9_0_0] +>; + +def : ProcessorModel<"gfx901", SIQuarterSpeedModel, + [FeatureISAVersion9_0_1] +>; + +def : ProcessorModel<"gfx902", SIQuarterSpeedModel, + [FeatureISAVersion9_0_2] +>; + +def : ProcessorModel<"gfx903", SIQuarterSpeedModel, + [FeatureISAVersion9_0_3] +>; + diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index d0aba38..fbe45cb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -62,7 +62,7 @@ private: const MachineInstr &LatrCFAlu) const; public: - R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + R600ClauseMergePass() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -208,6 +208,6 @@ StringRef R600ClauseMergePass::getPassName() const { } // end anonymous namespace -llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { - return new R600ClauseMergePass(TM); +llvm::FunctionPass *llvm::createR600ClauseMergePass() { + return new R600ClauseMergePass(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 45b36d3..00cbd24 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -12,17 +12,33 @@ /// computing their address on the fly ; it also sets STACK_SIZE info. //===----------------------------------------------------------------------===// -#include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <new> +#include <set> +#include <utility> +#include <vector> using namespace llvm; @@ -43,13 +59,12 @@ struct CFStack { std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; unsigned MaxStackSize; - unsigned CurrentEntries; - unsigned CurrentSubEntries; + unsigned CurrentEntries = 0; + unsigned CurrentSubEntries = 0; CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0), - CurrentEntries(0), CurrentSubEntries(0) { } + MaxStackSize(cc == CallingConv::AMDGPU_VS ? 
1 : 0) {} unsigned getLoopDepth(); bool branchStackContains(CFStack::StackItem); @@ -198,9 +213,8 @@ void CFStack::popLoop() { } class R600ControlFlowFinalizer : public MachineFunctionPass { - private: - typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile; enum ControlFlowInstruction { CF_TC, @@ -217,10 +231,10 @@ private: }; static char ID; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; + const R600InstrInfo *TII = nullptr; + const R600RegisterInfo *TRI = nullptr; unsigned MaxFetchInst; - const R600Subtarget *ST; + const R600Subtarget *ST = nullptr; bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { @@ -355,7 +369,7 @@ private: continue; int64_t Imm = Src.second; std::vector<MachineOperand *>::iterator It = - find_if(Lits, [&](MachineOperand *val) { + llvm::find_if(Lits, [&](MachineOperand *val) { return val->isImm() && (val->getImm() == Imm); }); @@ -485,8 +499,7 @@ private: } public: - R600ControlFlowFinalizer(TargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + R600ControlFlowFinalizer() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { ST = &MF.getSubtarget<R600Subtarget>(); @@ -501,7 +514,7 @@ public: ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; - std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; + std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack; std::vector<MachineInstr * > IfThenElseStack; if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), @@ -542,7 +555,7 @@ public: CFStack.pushBranch(AMDGPU::CF_PUSH_EG); } else CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); - + LLVM_FALLTHROUGH; case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); @@ -554,7 +567,7 @@ public: MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) .addImm(1); - std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, + std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); LoopStack.push_back(std::move(Pair)); @@ -564,7 +577,7 @@ public: } case AMDGPU::ENDLOOP: { CFStack.popLoop(); - std::pair<unsigned, std::set<MachineInstr *> > Pair = + std::pair<unsigned, std::set<MachineInstr *>> Pair = std::move(LoopStack.back()); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); @@ -693,7 +706,6 @@ char R600ControlFlowFinalizer::ID = 0; } // end anonymous namespace - -llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { - return new R600ControlFlowFinalizer(TM); +FunctionPass *llvm::createR600ControlFlowFinalizer() { + return new R600ControlFlowFinalizer(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 9a5db6c..0d8ccd0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -15,28 +15,39 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include 
"llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Pass.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> using namespace llvm; namespace llvm { + void initializeR600EmitClauseMarkersPass(PassRegistry&); -} + +} // end namespace llvm namespace { class R600EmitClauseMarkers : public MachineFunctionPass { - private: - const R600InstrInfo *TII; - int Address; + const R600InstrInfo *TII = nullptr; + int Address = 0; unsigned OccupiedDwords(MachineInstr &MI) const { switch (MI.getOpcode()) { @@ -118,7 +129,7 @@ private: SubstituteKCacheBank(MachineInstr &MI, std::vector<std::pair<unsigned, unsigned>> &CachedConsts, bool UpdateInstr = true) const { - std::vector<std::pair<unsigned, unsigned> > UsedKCache; + std::vector<std::pair<unsigned, unsigned>> UsedKCache; if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) return true; @@ -181,10 +192,11 @@ private: bool canClauseLocalKillFitInClause( unsigned AluInstCount, - std::vector<std::pair<unsigned, unsigned> > KCacheBanks, + std::vector<std::pair<unsigned, unsigned>> KCacheBanks, MachineBasicBlock::iterator Def, MachineBasicBlock::iterator BBEnd) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); + //TODO: change this to defs? for (MachineInstr::const_mop_iterator MOI = Def->operands_begin(), MOE = Def->operands_end(); MOI != MOE; ++MOI) { @@ -207,15 +219,17 @@ private: if (AluInstCount >= TII->getMaxAlusPerClause()) return false; + // TODO: Is this true? kill flag appears to work OK below // Register kill flags have been cleared by the time we get to this // pass, but it is safe to assume that all uses of this register // occur in the same basic block as its definition, because // it is illegal for the scheduler to schedule them in // different blocks. 
- if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + if (UseI->readsRegister(MOI->getReg())) LastUseCount = AluInstCount; - if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + // Exit early if the current use kills the register + if (UseI != Def && UseI->killsRegister(MOI->getReg())) break; } if (LastUseCount) @@ -228,7 +242,7 @@ private: MachineBasicBlock::iterator MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator ClauseHead = I; - std::vector<std::pair<unsigned, unsigned> > KCacheBanks; + std::vector<std::pair<unsigned, unsigned>> KCacheBanks; bool PushBeforeModifier = false; unsigned AluInstCount = 0; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { @@ -294,8 +308,8 @@ private: public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { + R600EmitClauseMarkers() : MachineFunctionPass(ID) { initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); } @@ -310,9 +324,11 @@ public: if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(*I)) - I = MakeALUClause(MBB, I); - else + if (isALU(*I)) { + auto next = MakeALUClause(MBB, I); + assert(next != I); + I = next; + } else ++I; } } @@ -333,7 +349,6 @@ INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", "R600 Emit Clause Markters", false, false) -llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { +FunctionPass *llvm::createR600EmitClauseMarkers() { return new R600EmitClauseMarkers(); } - diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 3e46e63..66def2d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -15,11 +15,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -37,7 +37,7 @@ private: unsigned Op); public: - R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID), TII(nullptr) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -51,8 +51,8 @@ public: char R600ExpandSpecialInstrsPass::ID = 0; -FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { - return new R600ExpandSpecialInstrsPass(TM); +FunctionPass *llvm::createR600ExpandSpecialInstrsPass() { + return new R600ExpandSpecialInstrsPass(); } void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index 5813786..37787b3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -8,7 +8,43 @@ //==-----------------------------------------------------------------------===// #include "R600FrameLowering.h" +#include "AMDGPUSubtarget.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" 
+#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; R600FrameLowering::~R600FrameLowering() = default; + +/// \returns The number of registers allocated for \p FI. +int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const R600RegisterInfo *RI + = MF.getSubtarget<R600Subtarget>().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + + // Start the offset at 2 so we don't overwrite work group information. + // FIXME: We should only do this when the shader actually uses this + // information. + unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI.getNumObjects() : FI; + + for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); + OffsetBytes += MFI.getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. + OffsetBytes = alignTo(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h index 874435f..142f709 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h @@ -25,6 +25,8 @@ public: MachineBasicBlock &MBB) const override {} void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 77fee435..69a63b6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -221,6 +221,15 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, VT, Expand); } + // LLVM will expand these to atomic_cmp_swap(0) + // and atomic_swap, respectively. 
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setSchedulingPreference(Sched::Source); setTargetDAGCombine(ISD::FP_ROUND); @@ -266,7 +275,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI.getOperand(i)); + NewMI.add(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -339,34 +348,34 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; case AMDGPU::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) .addImm(isEOP(I)); // Set End of program bit break; case AMDGPU::BRANCH: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); break; case AMDGPU::BRANCH_COND_f32: { MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(AMDGPU::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -375,12 +384,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(AMDGPU::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -408,13 +417,13 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addOperand(MI.getOperand(3)) - .addOperand(MI.getOperand(4)) - .addOperand(MI.getOperand(5)) - .addOperand(MI.getOperand(6)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)) + .add(MI.getOperand(6)) .addImm(CfInst) .addImm(EOP); break; @@ -490,8 +499,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); - switch(IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + switch (IntrinsicID) { case AMDGPUIntrinsic::r600_tex: case AMDGPUIntrinsic::r600_texc: { unsigned TextureOp; @@ -552,7 +560,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const } case Intrinsic::r600_implicitarg_ptr: { - MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } @@ -576,29 +584,31 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case Intrinsic::r600_recipsqrt_clamped: return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + default: + return Op; } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) @@ -702,12 +712,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); const DataLayout &DL = DAG.getDataLayout(); const GlobalValue *GV = GSD->getGlobal(); - MVT ConstPtrVT = getPointerTy(DL, 
AMDGPUAS::CONSTANT_ADDRESS); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); @@ -864,7 +874,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUASI.CONSTANT_BUFFER_0); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt<16>(ByteOffset)); @@ -911,7 +921,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const if (VT == MVT::f32) { DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); if (MinMax) return MinMax; } @@ -1102,7 +1112,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, //TODO: Who creates the i8 stores? assert(Store->isTruncatingStore() || Store->getValue().getValueType() == MVT::i8); - assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); + assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS); SDValue Mask; if (Store->getMemoryVT() == MVT::i8) { @@ -1110,7 +1120,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, Mask = DAG.getConstant(0xff, DL, MVT::i32); } else if (Store->getMemoryVT() == MVT::i16) { assert(Store->getAlignment() >= 2); - Mask = DAG.getConstant(0xffff, DL, MVT::i32);; + Mask = DAG.getConstant(0xffff, DL, MVT::i32); } else { llvm_unreachable("Unsupported private trunc store"); } @@ -1200,9 +1210,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { - if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { + if ((AS == AMDGPUASI.PRIVATE_ADDRESS) && + StoreNode->isTruncatingStore()) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? 
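A recurring change in this hunk and the ones that follow is the move from the fixed AMDGPUAS::* enumerators to members of a per-target AMDGPUAS instance named AMDGPUASI. The pattern, shown with an illustrative struct that only mirrors the field names used in the diff (the real struct and its values live elsewhere in the target):

// Illustrative stand-in for the per-target address-space table; the real
// AMDGPUAS struct has more fields and target-dependent values.
struct AMDGPUASExample {
  unsigned GLOBAL_ADDRESS;
  unsigned CONSTANT_ADDRESS;
  unsigned LOCAL_ADDRESS;
  unsigned PRIVATE_ADDRESS;
};

// Queries now compare against an instance field instead of a compile-time
// enumerator, so the numbering can differ between subtargets.
static bool isPrivate(const AMDGPUASExample &AS, unsigned AddrSpace) {
  return AddrSpace == AS.PRIVATE_ADDRESS;
}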
@@ -1225,7 +1236,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, DAG.getConstant(2, DL, PtrVT)); - if (AS == AMDGPUAS::GLOBAL_ADDRESS) { + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { @@ -1278,7 +1289,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes - if (AS != AMDGPUAS::PRIVATE_ADDRESS) + if (AS != AMDGPUASI.PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) @@ -1297,39 +1308,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // return (512 + (kc_bank << 12) static int -ConstantAddressBlock(unsigned AddressSpace) { +ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) { switch (AddressSpace) { - case AMDGPUAS::CONSTANT_BUFFER_0: + case AMDGPUASI.CONSTANT_BUFFER_0: return 512; - case AMDGPUAS::CONSTANT_BUFFER_1: + case AMDGPUASI.CONSTANT_BUFFER_1: return 512 + 4096; - case AMDGPUAS::CONSTANT_BUFFER_2: + case AMDGPUASI.CONSTANT_BUFFER_2: return 512 + 4096 * 2; - case AMDGPUAS::CONSTANT_BUFFER_3: + case AMDGPUASI.CONSTANT_BUFFER_3: return 512 + 4096 * 3; - case AMDGPUAS::CONSTANT_BUFFER_4: + case AMDGPUASI.CONSTANT_BUFFER_4: return 512 + 4096 * 4; - case AMDGPUAS::CONSTANT_BUFFER_5: + case AMDGPUASI.CONSTANT_BUFFER_5: return 512 + 4096 * 5; - case AMDGPUAS::CONSTANT_BUFFER_6: + case AMDGPUASI.CONSTANT_BUFFER_6: return 512 + 4096 * 6; - case AMDGPUAS::CONSTANT_BUFFER_7: + case AMDGPUASI.CONSTANT_BUFFER_7: return 512 + 4096 * 7; - case AMDGPUAS::CONSTANT_BUFFER_8: + case AMDGPUASI.CONSTANT_BUFFER_8: return 512 + 4096 * 8; - case AMDGPUAS::CONSTANT_BUFFER_9: + case AMDGPUASI.CONSTANT_BUFFER_9: return 512 + 4096 * 9; - case AMDGPUAS::CONSTANT_BUFFER_10: + case AMDGPUASI.CONSTANT_BUFFER_10: return 512 + 4096 * 10; - case AMDGPUAS::CONSTANT_BUFFER_11: + case AMDGPUASI.CONSTANT_BUFFER_11: return 512 + 4096 * 11; - case AMDGPUAS::CONSTANT_BUFFER_12: + case AMDGPUASI.CONSTANT_BUFFER_12: return 512 + 4096 * 12; - case AMDGPUAS::CONSTANT_BUFFER_13: + case AMDGPUASI.CONSTANT_BUFFER_13: return 512 + 4096 * 13; - case AMDGPUAS::CONSTANT_BUFFER_14: + case AMDGPUASI.CONSTANT_BUFFER_14: return 512 + 4096 * 14; - case AMDGPUAS::CONSTANT_BUFFER_15: + case AMDGPUASI.CONSTANT_BUFFER_15: return 512 + 4096 * 15; default: return -1; @@ -1397,7 +1408,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = LoadNode->getMemoryVT(); ISD::LoadExtType ExtType = LoadNode->getExtensionType(); - if (AS == AMDGPUAS::PRIVATE_ADDRESS && + if (AS == AMDGPUASI.PRIVATE_ADDRESS && ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { return lowerPrivateExtLoad(Op, DAG); } @@ -1407,13 +1418,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { return scalarizeVectorLoad(LoadNode, DAG); } - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + int ConstantBlock = 
ConstantAddressBlock(LoadNode->getAddressSpace(), + AMDGPUASI); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { @@ -1445,7 +1457,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, DL, MVT::i32)), DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) + AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32) ); } @@ -1481,7 +1493,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) { return SDValue(); } @@ -1535,7 +1547,7 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector<ISD::InputArg, 8> LocalIns; if (AMDGPU::isShader(CallConv)) { - AnalyzeFormalArguments(CCInfo, Ins); + CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); } else { analyzeFormalArgumentsCompute(CCInfo, Ins); } @@ -1558,7 +1570,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUASI.CONSTANT_BUFFER_0); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. It attempts to create this sextload, but it ends up @@ -1606,6 +1618,15 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const { + // Local and Private addresses do not handle vectors. 
Limit to i32 + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + return (MemVT.getSizeInBits() <= 32); + } + return true; +} + bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 9700ce1..2a77469 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,6 +44,9 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index e88bd07..c5da5e4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -14,14 +14,32 @@ #include "R600InstrInfo.h" #include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDGPUTargetMachine.h" #include "R600Defines.h" -#include "R600MachineFunctionInfo.h" +#include "R600FrameLowering.h" #include "R600RegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <utility> +#include <vector> using namespace llvm; @@ -191,7 +209,7 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && usesVertexCache(MI.getOpcode())) || - usesTextureCache(MI.getOpcode()); + usesTextureCache(MI.getOpcode()); } bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { @@ -321,7 +339,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, unsigned &ConstCount) const { ConstCount = 0; const std::pair<int, unsigned> DummyPair(-1, 0); - std::vector<std::pair<int, unsigned> > Result; + std::vector<std::pair<int, unsigned>> Result; unsigned i = 0; for (const auto &Src : getSrcs(MI)) { ++i; @@ -348,8 +366,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, return Result; } -static std::vector<std::pair<int, unsigned> > -Swizzle(std::vector<std::pair<int, unsigned> > Src, +static std::vector<std::pair<int, unsigned>> +Swizzle(std::vector<std::pair<int, unsigned>> Src, R600InstrInfo::BankSwizzle Swz) { if (Src[0] == Src[1]) Src[1].first = -1; @@ -404,14 +422,14 @@ static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { /// in the same Instruction Group while meeting read port limitations given a /// Swz swizzle sequence. 
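For reference, the ConstantAddressBlock() table in the R600ISelLowering.cpp hunk above encodes one simple formula, 512 + 4096 * kc_bank, which the surrounding comment writes as 512 + (kc_bank << 12). A small standalone sketch of that mapping, with illustrative names and without the LLVM address-space enums:

// Sketch of the kcache offset math computed by ConstantAddressBlock():
// CONSTANT_BUFFER_N maps to 512 + N * 4096, i.e. 512 + (N << 12).
#include <cassert>
#include <cstdio>

static int constantBufferBlock(int KCBank) {
  if (KCBank < 0 || KCBank > 15)
    return -1;                 // not a constant-buffer address space
  return 512 + (KCBank << 12); // 512 + 4096 * bank
}

int main() {
  assert(constantBufferBlock(0) == 512);
  assert(constantBufferBlock(5) == 512 + 4096 * 5);
  std::printf("bank 15 starts at %d\n", constantBufferBlock(15)); // 61952
  return 0;
}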
unsigned R600InstrInfo::isLegalUpTo( - const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs, const std::vector<R600InstrInfo::BankSwizzle> &Swz, - const std::vector<std::pair<int, unsigned> > &TransSrcs, + const std::vector<std::pair<int, unsigned>> &TransSrcs, R600InstrInfo::BankSwizzle TransSwz) const { int Vector[4][3]; memset(Vector, -1, sizeof(Vector)); for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { - const std::vector<std::pair<int, unsigned> > &Srcs = + const std::vector<std::pair<int, unsigned>> &Srcs = Swizzle(IGSrcs[i], Swz[i]); for (unsigned j = 0; j < 3; j++) { const std::pair<int, unsigned> &Src = Srcs[j]; @@ -473,9 +491,9 @@ NextPossibleSolution( /// Enumerate all possible Swizzle sequence to find one that can meet all /// read port requirements. bool R600InstrInfo::FindSwizzleForVectorSlot( - const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs, std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, - const std::vector<std::pair<int, unsigned> > &TransSrcs, + const std::vector<std::pair<int, unsigned>> &TransSrcs, R600InstrInfo::BankSwizzle TransSwz) const { unsigned ValidUpTo = 0; do { @@ -490,7 +508,7 @@ bool R600InstrInfo::FindSwizzleForVectorSlot( /// a const, and can't read a gpr at cycle 1 if they read 2 const. static bool isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, - const std::vector<std::pair<int, unsigned> > &TransOps, + const std::vector<std::pair<int, unsigned>> &TransOps, unsigned ConstCount) { // TransALU can't read 3 constants if (ConstCount > 2) @@ -516,7 +534,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, const { //Todo : support shared src0 - src1 operand - std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs; + std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs; ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; @@ -527,7 +545,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) IG[i]->getOperand(Op).getImm()); } - std::vector<std::pair<int, unsigned> > TransOps; + std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); @@ -556,7 +574,6 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, return false; } - bool R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { @@ -780,7 +797,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { - assert(!BytesRemoved && "code size not handled"); + assert(!BytesRemoved && "code size not handled"); // Note : we leave PRED* instructions there. // They may be needed when predicating instructions. @@ -852,7 +869,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { } } -bool R600InstrInfo::isPredicable(MachineInstr &MI) const { +bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { // XXX: KILL* instructions can be predicated, but they must be the last // instruction in a clause, so this means any instructions after them cannot // be predicated. 
Until we have proper support for instruction clauses in the @@ -863,7 +880,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const { } else if (MI.getOpcode() == AMDGPU::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. - if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI)) + if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) return false; // TODO: We don't support KC merging atm return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0; @@ -874,10 +891,9 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const { } } - bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, BranchProbability Probability) const{ return true; @@ -896,7 +912,7 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, BranchProbability Probability) const { return true; @@ -908,7 +924,6 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, return false; } - bool R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { MachineOperand &MO = Cond[1]; @@ -948,7 +963,6 @@ bool R600InstrInfo::DefinesPredicate(MachineInstr &MI, return isPredicateSetter(MI.getOpcode()); } - bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { int PIdx = MI.findFirstPredOperandIdx(); @@ -1067,7 +1081,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } -void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600FrameLowering *TFL = ST.getFrameLowering(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index a280052..3b82800 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -177,12 +177,12 @@ public: bool isPredicated(const MachineInstr &MI) const override; - bool isPredicable(MachineInstr &MI) const override; + bool isPredicable(const MachineInstr &MI) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, BranchProbability Probability) const override; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, BranchProbability Probability) const override ; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 9210e66..bac557b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -316,7 +316,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern> class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || - (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] + (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }] >; def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>; @@ -326,8 +326,8 @@ def 
vtx_id3_load : LoadParamFrag<load>; class LoadVtxId1 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && !isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout()))); }]>; @@ -339,7 +339,7 @@ def vtx_id1_load : LoadVtxId1 <load>; class LoadVtxId2 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); }]>; @@ -1013,7 +1013,7 @@ multiclass CUBE_Common <bits<11> inst> { (outs R600_Reg128:$dst), (ins R600_Reg128:$src0), "CUBE $dst $src0", - [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + [(set v4f32:$dst, (int_r600_cube v4f32:$src0))], VecALU > { let isPseudo = 1; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td index a5310e9..4c9e1e8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td @@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput; def int_r600_ddy : TextureIntrinsicFloatInput; def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600", isTarget = 1 diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index db18e5b..a7e540f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -13,16 +13,16 @@ //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" -#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Pass.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index d90008a..502dd3b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -124,7 +124,7 @@ private: public: static char ID; - R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), + R600VectorRegMerger() : MachineFunctionPass(ID), TII(nullptr) { } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -396,6 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { return false; } -llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { - return new R600VectorRegMerger(tm); +llvm::FunctionPass *llvm::createR600VectorRegMerger() { + return new R600VectorRegMerger(); } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 5b6dd1e..1cb4093 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" @@ -24,6 +23,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -36,7 +36,7 @@ class R600Packetizer : public MachineFunctionPass { public: static char ID; - R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + R600Packetizer() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -404,6 +404,6 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { } // end anonymous namespace -llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { - return new R600Packetizer(tm); +llvm::FunctionPass *llvm::createR600Packetizer() { + return new R600Packetizer(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index dfdc602..7501fac 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +// Dummy to not crash RegisterClassInfo. +static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { + return &CalleeSavedReg; +} + +unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9dfb310..f0d9644 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; /// \brief get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td index cc667d9..3c1e852 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td @@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_Addr, R600_KC0, R600_KC1, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP + ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index d70f52e..8cb35c5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -34,15 +35,6 @@ namespace { typedef std::pair<BasicBlock *, Value *> StackEntry; typedef SmallVector<StackEntry, 16> StackVector; -// Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.amdgcn.if"; -static const char *const ElseIntrinsic = "llvm.amdgcn.else"; -static const char *const BreakIntrinsic = "llvm.amdgcn.break"; -static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break"; -static const char *const LoopIntrinsic = "llvm.amdgcn.loop"; -static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf"; - class SIAnnotateControlFlow : public FunctionPass { DivergenceAnalysis *DA; @@ -56,13 +48,13 @@ class SIAnnotateControlFlow : public FunctionPass { UndefValue *BoolUndef; Constant *Int64Zero; - Constant *If; - Constant *Else; - Constant *Break; - Constant *IfBreak; - Constant *ElseBreak; - Constant *Loop; - Constant *EndCf; + Function *If; + Function *Else; + Function *Break; + Function *IfBreak; + Function *ElseBreak; + Function *Loop; + Function *EndCf; DominatorTree *DT; StackVector Stack; @@ -85,8 +77,10 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - Value *handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term); + Value * + handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, + BranchInst *Term, + SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions); void handleLoop(BranchInst *Term); @@ -118,6 +112,7 @@ public: INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -131,37 +126,20 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + ReturnStruct = StructType::get(Boolean, Int64); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); BoolUndef = UndefValue::get(Boolean); Int64Zero = ConstantInt::get(Int64, 0); - If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); - - 
Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); - - Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)nullptr); - cast<Function>(Break)->setDoesNotAccessMemory(); - - IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); - cast<Function>(IfBreak)->setDoesNotAccessMemory();; - - ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); - cast<Function>(ElseBreak)->setDoesNotAccessMemory(); - - Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)nullptr); - - EndCf = M.getOrInsertFunction( - EndCfIntrinsic, Void, Int64, (Type *)nullptr); - + If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); + Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); + Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break); + IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); + ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break); + Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); + EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); return false; } @@ -208,15 +186,16 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { // \brief Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (!Phi->hasNUsesOrMore(1)) - Phi->eraseFromParent(); + if (llvm::RecursivelyDeleteDeadPHINode(Phi)) { + DEBUG(dbgs() << "Erased unused condition phi\n"); + } } /// \brief Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { - if (isUniform(Term)) { + if (isUniform(Term)) return; - } + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -233,8 +212,9 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { } /// \brief Recursively handle the condition leading to a loop -Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term) { +Value *SIAnnotateControlFlow::handleLoopCondition( + Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, + SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { // Only search through PHI nodes which are inside the loop. 
If we try this // with PHI nodes that are outside of the loop, we end up inserting new PHI @@ -245,7 +225,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front()); Value *Ret = NewPhi; // Handle all non-constant incoming values first @@ -258,14 +238,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, } Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L, + Term, LoopPhiConditions); NewPhi->addIncoming(PhiArg, From); } BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); if (Incoming != BoolTrue) continue; @@ -295,14 +275,17 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, continue; } } + TerminatorInst *Insert = From->getTerminator(); Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); NewPhi->setIncomingValue(i, PhiArg); } - eraseIfUnused(Phi); + + LoopPhiConditions.push_back(WeakTrackingVH(Phi)); return Ret; + } - } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; if (L->contains(Inst)) { @@ -310,46 +293,55 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, } else { Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); } + Value *Args[] = { Cond, Broken }; return CallInst::Create(IfBreak, Args, "", Insert); + } - // Insert IfBreak before TERM for constant COND. - } else if (isa<ConstantInt>(Cond)) { - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Term); + // Insert IfBreak in the loop header TERM for constant COND other than true. + if (isa<Constant>(Cond)) { + Instruction *Insert = Cond == BoolTrue ? + Term : L->getHeader()->getTerminator(); - } else { - llvm_unreachable("Unhandled loop condition!"); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); } - return nullptr; + + llvm_unreachable("Unhandled loop condition!"); } /// \brief Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { - if (isUniform(Term)) { + if (isUniform(Term)) return; - } BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); if (!L) return; + BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); + SmallVector<WeakTrackingVH, 8> LoopPhiConditions; Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L, Term); + Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions); - for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); - PI != PE; ++PI) { + for (BasicBlock *Pred : predecessors(Target)) + Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred); + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - Broken->addIncoming(*PI == BB ? 
Arg : Int64Zero, *PI); + for (WeakTrackingVH Val : reverse(LoopPhiConditions)) { + if (PHINode *Cond = cast_or_null<PHINode>(Val)) + eraseIfUnused(Cond); } - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); -}/// \brief Close the last opened control flow +} + +/// \brief Close the last opened control flow void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); @@ -359,59 +351,62 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { // We can't insert an EndCF call into a loop header, because it will // get executed on every iteration of the loop, when it should be // executed only once before the loop. - SmallVector <BasicBlock*, 8> Latches; + SmallVector <BasicBlock *, 8> Latches; L->getLoopLatches(Latches); - std::vector<BasicBlock*> Preds; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (!is_contained(Latches, *PI)) - Preds.push_back(*PI); + SmallVector<BasicBlock *, 2> Preds; + for (BasicBlock *Pred : predecessors(BB)) { + if (!is_contained(Latches, Pred)) + Preds.push_back(Pred); } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } Value *Exec = popSaved(); - if (!isa<UndefValue>(Exec)) - CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt()); + Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt(); + if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) + CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } /// \brief Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DivergenceAnalysis>(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { - - BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + BasicBlock *BB = *I; + BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); if (!Term || Term->isUnconditional()) { - if (isTopOfStack(*I)) - closeControlFlow(*I); + if (isTopOfStack(BB)) + closeControlFlow(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { - if (isTopOfStack(*I)) - closeControlFlow(*I); + if (isTopOfStack(BB)) + closeControlFlow(BB); handleLoop(Term); continue; } - if (isTopOfStack(*I)) { + if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); - if (Phi && Phi->getParent() == *I && isElse(Phi)) { + if (Phi && Phi->getParent() == BB && isElse(Phi)) { insertElse(Term); eraseIfUnused(Phi); continue; } - closeControlFlow(*I); + + closeControlFlow(BB); } + openIf(Term); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp index 62ebef8..b5c439b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp @@ -19,8 +19,8 @@ // //===----------------------------------------------------------------------===// -#include "SIInstrInfo.h" #include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index ff4e321..3915c0e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ 
b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -36,6 +36,7 @@ enum : uint64_t { // TODO: Should this be spilt into VOP3 a and b? VOP3 = 1 << 10, + VOP3P = 1 << 12, VINTRP = 1 << 13, SDWA = 1 << 14, @@ -65,8 +66,8 @@ enum : uint64_t { SOPK_ZEXT = UINT64_C(1) << 38, SCALAR_STORE = UINT64_C(1) << 39, FIXED_SIZE = UINT64_C(1) << 40, - VOPAsmPrefer32Bit = UINT64_C(1) << 41 - + VOPAsmPrefer32Bit = UINT64_C(1) << 41, + HasFPClamp = UINT64_C(1) << 42 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -102,12 +103,14 @@ namespace AMDGPU { OPERAND_REG_INLINE_C_FP16, OPERAND_REG_INLINE_C_FP32, OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_V2FP16, + OPERAND_REG_INLINE_C_V2INT16, OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -115,6 +118,10 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, + // Operand for SDWA instructions + OPERAND_SDWA_SRC, + OPERAND_SDWA_VOPC_DST, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16 @@ -125,9 +132,12 @@ namespace AMDGPU { // NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum { - NEG = 1 << 0, // Floating-point negate modifier - ABS = 1 << 1, // Floating-point absolute modifier - SEXT = 1 << 0 // Integer sign-extend modifier + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 0, // Integer sign-extend modifier + NEG_HI = ABS, // Floating-point negate high packed component modifier. + OP_SEL_0 = 1 << 2, + OP_SEL_1 = 1 << 3 }; } @@ -154,7 +164,8 @@ namespace AMDGPUAsmVariants { DEFAULT = 0, VOP3 = 1, SDWA = 2, - DPP = 3 + SDWA9 = 3, + DPP = 4 }; } @@ -242,6 +253,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_SYMBOLIC_LAST_ = 8, + ID_MEM_BASES = 15, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -251,18 +263,64 @@ enum Offset { // Offset, (5) [10:6] OFFSET_DEFAULT_ = 0, OFFSET_SHIFT_ = 6, OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + + OFFSET_SRC_SHARED_BASE = 16, + OFFSET_SRC_PRIVATE_BASE = 0 }; enum WidthMinusOne { // WidthMinusOne, (5) [15:11] WIDTH_M1_DEFAULT_ = 31, WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), + + WIDTH_M1_SRC_SHARED_BASE = 15, + WIDTH_M1_SRC_PRIVATE_BASE = 15 }; } // namespace Hwreg +namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. 
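The Hwreg changes in this SIDefines.h hunk (ID_MEM_BASES, OFFSET_SRC_SHARED_BASE, WIDTH_M1_SRC_SHARED_BASE) all describe fields of the 16-bit hwreg() descriptor: id in bits [5:0], offset in [10:6], width minus one in [15:11]. A sketch of how those fields pack, using the shift and width constants from the hunk; encodeHwreg is an illustrative helper, not an API from the patch.

// Pack an s_getreg/s_setreg hwreg() descriptor from its three fields,
// mirroring ID_MASK_, OFFSET_MASK_ and WIDTH_M1_MASK_ above.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  assert(Id < 64 && Offset < 32 && Width >= 1 && Width <= 32);
  return static_cast<uint16_t>((Id & 0x3f) |              // id, bits [5:0]
                               ((Offset & 0x1f) << 6) |   // offset, [10:6]
                               (((Width - 1) & 0x1f) << 11)); // width-1, [15:11]
}

int main() {
  // ID_MEM_BASES = 15; per OFFSET_SRC_SHARED_BASE / WIDTH_M1_SRC_SHARED_BASE
  // above, the shared base lives at offset 16 with width 16 (width-1 = 15).
  std::printf("0x%04x\n",
              static_cast<unsigned>(encodeHwreg(15, 16, 16))); // 0x7c0f
  return 0;
}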
+ +enum Id { // id of symbolic names + ID_QUAD_PERM = 0, + ID_BITMASK_PERM, + ID_SWAP, + ID_REVERSE, + ID_BROADCAST +}; + +enum EncBits { + + // swizzle mode encodings + + QUAD_PERM_ENC = 0x8000, + QUAD_PERM_ENC_MASK = 0xFF00, + + BITMASK_PERM_ENC = 0x0000, + BITMASK_PERM_ENC_MASK = 0x8000, + + // QUAD_PERM encodings + + LANE_MASK = 0x3, + LANE_MAX = LANE_MASK, + LANE_SHIFT = 2, + LANE_NUM = 4, + + // BITMASK_PERM encodings + + BITMASK_MASK = 0x1F, + BITMASK_MAX = BITMASK_MASK, + BITMASK_WIDTH = 5, + + BITMASK_AND_SHIFT = 0, + BITMASK_OR_SHIFT = 5, + BITMASK_XOR_SHIFT = 10 +}; + +} // namespace Swizzle + namespace SDWA { enum SdwaSel { @@ -281,6 +339,18 @@ enum DstUnused { UNUSED_PRESERVE = 2, }; +enum SDWA9EncValues{ + SRC_SGPR_MASK = 0x100, + SRC_VGPR_MASK = 0xFF, + VOPC_DST_VCC_MASK = 0x80, + VOPC_DST_SGPR_MASK = 0x7F, + + SRC_VGPR_MIN = 0, + SRC_VGPR_MAX = 255, + SRC_SGPR_MIN = 256, + SRC_SGPR_MAX = 357, +}; + } // namespace SDWA } // namespace AMDGPU @@ -289,6 +359,7 @@ enum DstUnused { #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) #define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 #define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) @@ -300,6 +371,9 @@ enum DstUnused { #define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TRAP_HANDLER(x) (((x) & 0x1) << 6) +#define G_00B84C_TRAP_HANDLER(x) (((x) >> 6) & 0x1) +#define C_00B84C_TRAP_HANDLER 0xFFFFFFBF #define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) #define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) #define C_00B84C_TGID_X_EN 0xFFFFFF7F @@ -387,7 +461,6 @@ enum DstUnused { #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 - } // End namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 6a422e7..0a795c9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -68,6 +68,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -80,6 +81,11 @@ using namespace llvm; #define DEBUG_TYPE "si-fix-sgpr-copies" +static cl::opt<bool> EnableM0Merge( + "amdgpu-enable-merge-m0", + cl::desc("Merge and hoist M0 initializations"), + cl::init(false)); + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -107,7 +113,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -168,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + 
!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -198,12 +229,19 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (!CopyUse.isCopy()) return false; + // It is illegal to have vreg inputs to a physreg defining reg_sequence. + if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg())) + return false; + const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -234,8 +272,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); - BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) - .addOperand(MI.getOperand(I)); + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), + TmpReg) + .add(MI.getOperand(I)); MI.getOperand(I).setReg(TmpReg); } @@ -267,8 +306,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI, Visited.insert(Reg); - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); + MachineInstr *DefInstr = MRI.getVRegDef(Reg); switch (DefInstr->getOpcode()) { default: break; @@ -326,6 +364,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return true; } +template <class UnaryPredicate> +bool searchPredecessors(const MachineBasicBlock *MBB, + const MachineBasicBlock *CutOff, + UnaryPredicate Predicate) { + + if (MBB == CutOff) + return false; + + DenseSet<const MachineBasicBlock*> Visited; + SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), + MBB->pred_end()); + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + + if (!Visited.insert(MBB).second) + continue; + if (MBB == CutOff) + continue; + if (Predicate(MBB)) + return true; + + Worklist.append(MBB->pred_begin(), MBB->pred_end()); + } + + return false; +} + +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + +// Checks if there is potential path From instruction To instruction. +// If CutOff is specified and it sits in between of that path we ignore +// a higher portion of the path and report it is not reachable. +static bool isReachable(const MachineInstr *From, + const MachineInstr *To, + const MachineBasicBlock *CutOff, + MachineDominatorTree &MDT) { + // If either From block dominates To block or instructions are in the same + // block and From is higher. 
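searchPredecessors() above is a plain worklist walk over predecessor edges with a visited set and an optional cut-off block. The same shape, rendered with STL containers so it compiles and runs on its own; the graph, node numbering, and function names here are made up for illustration.

// Standalone rendering of the searchPredecessors() walk: visit predecessors
// breadth/depth-first, skip already-visited nodes and the cut-off node.
#include <cstdio>
#include <functional>
#include <map>
#include <set>
#include <vector>

using Graph = std::map<int, std::vector<int>>; // node -> its predecessors

static bool searchPreds(const Graph &Preds, int Start, int CutOff,
                        const std::function<bool(int)> &Predicate) {
  if (Start == CutOff)
    return false;
  std::set<int> Visited;
  std::vector<int> Worklist =
      Preds.count(Start) ? Preds.at(Start) : std::vector<int>();
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second || N == CutOff)
      continue;               // already seen, or the search is cut off here
    if (Predicate(N))
      return true;
    if (Preds.count(N)) {
      const auto &P = Preds.at(N);
      Worklist.insert(Worklist.end(), P.begin(), P.end());
    }
  }
  return false;
}

int main() {
  // CFG 0 -> 1 -> 2 -> 3, stored as predecessor lists.
  Graph Preds = {{1, {0}}, {2, {1}}, {3, {2}}};
  bool FromZero = searchPreds(Preds, 3, /*CutOff=*/-1,
                              [](int N) { return N == 0; });
  bool CutAtOne = searchPreds(Preds, 3, /*CutOff=*/1,
                              [](int N) { return N == 0; });
  std::printf("%d %d\n", FromZero, CutAtOne); // 1 0
  return 0;
}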
+ if (MDT.dominates(From, To)) + return true; + + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + if (MBBFrom == MBBTo) + return false; + + // Instructions are in different blocks, do predecessor search. + // We should almost never get here since we do not usually produce M0 stores + // other than -1. + return searchPredecessors(MBBTo, CutOff, [MBBFrom] + (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); +} + +// Hoist and merge identical SGPR initializations into a common predecessor. +// This is intended to combine M0 initializations, but can work with any +// SGPR. A VGPR cannot be processed since we cannot guarantee vector +// executioon. +static bool hoistAndMergeSGPRInits(unsigned Reg, + const MachineRegisterInfo &MRI, + MachineDominatorTree &MDT) { + // List of inits by immediate value. + typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap; + InitListMap Inits; + // List of clobbering instructions. + SmallVector<MachineInstr*, 8> Clobbers; + bool Changed = false; + + for (auto &MI : MRI.def_instructions(Reg)) { + MachineOperand *Imm = nullptr; + for (auto &MO: MI.operands()) { + if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || + (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { + Imm = nullptr; + break; + } else if (MO.isImm()) + Imm = &MO; + } + if (Imm) + Inits[Imm->getImm()].push_front(&MI); + else + Clobbers.push_back(&MI); + } + + for (auto &Init : Inits) { + auto &Defs = Init.second; + + for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { + MachineInstr *MI1 = *I1; + + for (auto I2 = std::next(I1); I2 != E; ) { + MachineInstr *MI2 = *I2; + + // Check any possible interference + auto intereferes = [&](MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To) -> bool { + + assert(MDT.dominates(&*To, &*From)); + + auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT); + bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT); + if (!MayClobberFrom && !MayClobberTo) + return false; + if ((MayClobberFrom && !MayClobberTo) || + (!MayClobberFrom && MayClobberTo)) + return true; + // Both can clobber, this is not an interference only if both are + // dominated by Clobber and belong to the same block or if Clobber + // properly dominates To, given that To >> From, so it dominates + // both and located in a common dominator. 
+ return !((MBBFrom == MBBTo && + MDT.dominates(Clobber, &*From) && + MDT.dominates(Clobber, &*To)) || + MDT.properlyDominates(Clobber->getParent(), MBBTo)); + }; + + return (any_of(Clobbers, interferes)) || + (any_of(Inits, [&](InitListMap::value_type &C) { + return C.first != Init.first && any_of(C.second, interferes); + })); + }; + + if (MDT.dominates(MI1, MI2)) { + if (!intereferes(MI2, MI1)) { + DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber() + << " " << *MI2); + MI2->eraseFromParent(); + Defs.erase(I2++); + Changed = true; + continue; + } + } else if (MDT.dominates(MI2, MI1)) { + if (!intereferes(MI1, MI2)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } else { + auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(), + MI2->getParent()); + if (!MBB) { + ++I2; + continue; + } + + MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); + if (!intereferes(MI1, I) && !intereferes(MI2, I)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1 << "and moving from BB#" + << MI2->getParent()->getNumber() << " to BB#" + << I->getParent()->getNumber() << " " << *MI2); + I->getParent()->splice(I, MI2->getParent(), MI2); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } + ++I2; + } + ++I1; + } + } + + if (Changed) + MRI.clearKillFlags(Reg); + + return Changed; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -355,7 +573,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg()); + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + TII->moveToVALU(MI); + break; + } + + MachineInstr *DefMI = MRI.getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with @@ -367,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; @@ -382,8 +608,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1); - if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) { + if (!predsHasDivergentTerminator(MBB0, TRI) && + !predsHasDivergentTerminator(MBB1, TRI)) { DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } @@ -458,5 +684,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } + if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT); + return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp new file mode 100644 index 0000000..3d31217 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -0,0 +1,72 @@ +//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===// +// +// The LLVM Compiler 
Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Add implicit use of exec to vector register copies. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-vgpr-copies" + +namespace { + +class SIFixVGPRCopies : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixVGPRCopies() : MachineFunctionPass(ID) { + initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fix VGPR copies"; } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false) + +char SIFixVGPRCopies::ID = 0; + +char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; + +bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) { + MI.addOperand(MF, + MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + DEBUG(dbgs() << "Add exec use to " << MI); + Changed = true; + } + break; + default: + break; + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a5c0d49..0aad8f0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -12,6 +12,8 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -34,9 +36,12 @@ struct FoldCandidate { }; unsigned char UseOpNo; MachineOperand::MachineOperandType Kind; + bool Commuted; - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) { + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, + bool Commuted_ = false) : + UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()), + Commuted(Commuted_) { if (FoldOp->isImm()) { ImmToFold = FoldOp->getImm(); } else if (FoldOp->isFI()) { @@ -58,6 +63,10 @@ struct FoldCandidate { bool isReg() const { return Kind == MachineOperand::MO_Register; } + + bool isCommuted() const { + return Commuted; + } }; class SIFoldOperands : public MachineFunctionPass { @@ -66,6 +75,7 @@ public: MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; + const SISubtarget *ST; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, @@ -75,6 +85,12 @@ public: void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + const MachineOperand *isClamp(const MachineInstr &MI) const; + bool tryFoldClamp(MachineInstr &MI); + + std::pair<const 
MachineOperand *, int> isOMod(const MachineInstr &MI) const; + bool tryFoldOMod(MachineInstr &MI); + public: SIFoldOperands() : MachineFunctionPass(ID) { initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); @@ -121,6 +137,7 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } + return false; } default: return false; @@ -131,27 +148,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } -static bool isSafeToFold(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: { - // If there are additional implicit register operands, this may be used for - // register indexing so the source register operand isn't simply copied. - unsigned NumOps = MI.getDesc().getNumOperands() + - MI.getDesc().getNumImplicitUses(); - - return MI.getNumOperands() == NumOps; - } - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - static bool updateOperand(FoldCandidate &Fold, const TargetRegisterInfo &TRI) { MachineInstr *MI = Fold.UseMI; @@ -172,6 +168,8 @@ static bool updateOperand(FoldCandidate &Fold, if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -250,8 +248,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { + TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true)); + return true; } FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); @@ -260,9 +263,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // If the use operand doesn't care about the value, this may be an operand only // used for register indexing, in which case it is unsafe to fold. -static bool isUseSafeToFold(const MachineInstr &MI, +static bool isUseSafeToFold(const SIInstrInfo *TII, + const MachineInstr &MI, const MachineOperand &UseMO) { - return !UseMO.isUndef(); + return !UseMO.isUndef() && !TII->isSDWA(MI); //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } @@ -274,7 +278,7 @@ void SIFoldOperands::foldOperand( SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); - if (!isUseSafeToFold(*UseMI, UseOp)) + if (!isUseSafeToFold(TII, *UseMI, UseOp)) return; // FIXME: Fold operands with subregs. @@ -359,8 +363,6 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *FoldRC = TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); - APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), - OpToFold.getImm()); // Split 64-bit constants into 32-bits for folding. 
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { @@ -370,21 +372,25 @@ void SIFoldOperands::foldOperand( MRI->getRegClass(UseReg) : TRI->getPhysRegClass(UseReg); - assert(Imm.getBitWidth() == 64); - if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) return; + APInt Imm(64, OpToFold.getImm()); if (UseOp.getSubReg() == AMDGPU::sub0) { Imm = Imm.getLoBits(32); } else { assert(UseOp.getSubReg() == AMDGPU::sub1); Imm = Imm.getHiBits(32); } + + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; } - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, @@ -468,7 +474,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -581,6 +587,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, return false; } +// Try to fold an instruction into a simpler one +static bool tryFoldInst(const SIInstrInfo *TII, + MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + + if (Opc == AMDGPU::V_CNDMASK_B32_e32 || + Opc == AMDGPU::V_CNDMASK_B32_e64 || + Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) { + const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); + if (Src1->isIdenticalTo(*Src0)) { + DEBUG(dbgs() << "Folded " << *MI << " into "); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) + MI->RemoveOperand(Src2Idx); + MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); + mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY + : getMovOpc(false))); + DEBUG(dbgs() << *MI << '\n'); + return true; + } + } + + return false; +} + void SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit @@ -621,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } @@ -682,31 +715,230 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + tryFoldInst(TII, Fold.UseMI); + } else if (Fold.isCommuted()) { + // Restoring instruction's original operand order if fold has failed. + TII->commuteInstruction(*Fold.UseMI, false); } } } +const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { + unsigned Op = MI.getOpcode(); + switch (Op) { + case AMDGPU::V_MAX_F32_e64: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F64: { + if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) + return nullptr; + + // Make sure sources are identical. 
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src0->isReg() || !Src1->isReg() || + Src0->getSubReg() != Src1->getSubReg() || + Src0->getSubReg() != AMDGPU::NoSubRegister) + return nullptr; + + // Can't fold up if we have modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return nullptr; + return Src0; + } + default: + return nullptr; + } +} + +// We obviously have multiple uses in a clamp since the register is used twice +// in the same instruction. +static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { + int Count = 0; + for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end(); + I != E; ++I) { + if (++Count > 1) + return false; + } + + return true; +} + +bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { + const MachineOperand *ClampSrc = isClamp(MI); + if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) + return false; + + MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); + if (!TII->hasFPClamp(*Def)) + return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); + if (!DefClamp) + return false; + + DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n'); + + // Clamp is applied after omod, so it is OK if omod is set. + DefClamp->setImm(1); + MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; +} + +static int getOModValue(unsigned Opc, int64_t Val) { + switch (Opc) { + case AMDGPU::V_MUL_F32_e64: { + switch (static_cast<uint32_t>(Val)) { + case 0x3f000000: // 0.5 + return SIOutMods::DIV2; + case 0x40000000: // 2.0 + return SIOutMods::MUL2; + case 0x40800000: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } + case AMDGPU::V_MUL_F16_e64: { + switch (static_cast<uint16_t>(Val)) { + case 0x3800: // 0.5 + return SIOutMods::DIV2; + case 0x4000: // 2.0 + return SIOutMods::MUL2; + case 0x4400: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } + default: + llvm_unreachable("invalid mul opcode"); + } +} + +// FIXME: Does this really not support denormals with f16? +// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not +// handled, so will anything other than that break? +std::pair<const MachineOperand *, int> +SIFoldOperands::isOMod(const MachineInstr &MI) const { + unsigned Op = MI.getOpcode(); + switch (Op) { + case AMDGPU::V_MUL_F32_e64: + case AMDGPU::V_MUL_F16_e64: { + // If output denormals are enabled, omod is ignored. 
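// A minimal standalone check (not the LLVM code) of why getOModValue above
// compares against those particular immediates: the output-modifier
// multipliers 0.5, 2.0 and 4.0 have fixed IEEE-754 encodings. The f32 values
// are verified below; the f16 encodings (0x3800, 0x4000, 0x4400) are listed
// only in comments since standard C++ has no 16-bit float type.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

int main() {
  assert(bitsOf(0.5f) == 0x3f000000u); // SIOutMods::DIV2
  assert(bitsOf(2.0f) == 0x40000000u); // SIOutMods::MUL2
  assert(bitsOf(4.0f) == 0x40800000u); // SIOutMods::MUL4
  return 0;
}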
+ if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) || + (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals())) + return std::make_pair(nullptr, SIOutMods::NONE); + + const MachineOperand *RegOp = nullptr; + const MachineOperand *ImmOp = nullptr; + const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src0->isImm()) { + ImmOp = Src0; + RegOp = Src1; + } else if (Src1->isImm()) { + ImmOp = Src1; + RegOp = Src0; + } else + return std::make_pair(nullptr, SIOutMods::NONE); + + int OMod = getOModValue(Op, ImmOp->getImm()); + if (OMod == SIOutMods::NONE || + TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod) || + TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return std::make_pair(nullptr, SIOutMods::NONE); + + return std::make_pair(RegOp, OMod); + } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F16_e64: { + // If output denormals are enabled, omod is ignored. + if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) || + (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals())) + return std::make_pair(nullptr, SIOutMods::NONE); + + // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x + const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + + if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() && + Src0->getSubReg() == Src1->getSubReg() && + !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return std::make_pair(Src0, SIOutMods::MUL2); + + return std::make_pair(nullptr, SIOutMods::NONE); + } + default: + return std::make_pair(nullptr, SIOutMods::NONE); + } +} + +// FIXME: Does this need to check IEEE bit on function? +bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { + const MachineOperand *RegOp; + int OMod; + std::tie(RegOp, OMod) = isOMod(MI); + if (OMod == SIOutMods::NONE || !RegOp->isReg() || + RegOp->getSubReg() != AMDGPU::NoSubRegister || + !hasOneNonDBGUseInst(*MRI, RegOp->getReg())) + return false; + + MachineInstr *Def = MRI->getVRegDef(RegOp->getReg()); + MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod); + if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) + return false; + + // Clamp is applied after omod. If the source already has clamp set, don't + // fold it. + if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) + return false; + + DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); + + DefOMod->setImm(OMod); + MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MRI = &MF.getRegInfo(); - TII = ST.getInstrInfo(); + ST = &MF.getSubtarget<SISubtarget>(); + TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // omod is ignored by hardware if IEEE bit is enabled. 
omod also does not + // correctly handle signed zeros. + // + // TODO: Check nsz on instructions when fast math flags are preserved to MI + // level. + bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; - if (!isSafeToFold(MI)) + tryFoldInst(TII, &MI); + + if (!TII->isFoldableCopy(MI)) { + if (IsIEEEMode || !tryFoldOMod(MI)) + tryFoldClamp(MI); continue; + } MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0b57155..7334781 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,10 +8,10 @@ //==-----------------------------------------------------------------------===// #include "SIFrameLowering.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -21,22 +21,24 @@ using namespace llvm; -static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF, - const SIRegisterInfo *TRI) { +static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST, + const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), - TRI->getMaxNumSGPRs(MF) / 4); + ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF, - const SIRegisterInfo *TRI) { +static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, + const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - TRI->getMaxNumSGPRs(MF)); + ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, +void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + // We don't need this if we only have spills since there is no user facing // scratch. @@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - // Copy the size in bytes. - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitHi, RegState::Kill); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + // Do a 64-bit pointer add. + if (ST.flatScratchIsPointer()) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitHi) + .addImm(0); + + return; + } + + // Copy the size in bytes. 
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + // Add wave offset in bytes to private base offset. // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) @@ -87,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (ScratchRsrcReg == AMDGPU::NoRegister) + if (ScratchRsrcReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; if (ST.hasSGPRInitBug() || @@ -108,19 +124,16 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // We find the resource first because it has an alignment requirement. - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; - ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI); + ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. + // Skip the last N reserved elements because they should have already been + // reserved for VCC etc. for (MCPhysReg Reg : AllSGPR128s) { // Pick the first unallocated one. Make sure we don't clobber the other // reserved input we needed. if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - //assert(MRI.isAllocatable(Reg)); MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); return Reg; @@ -130,25 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( +// Shift down registers reserved for the scratch wave offset and stack pointer +// SGPRs. +std::pair<unsigned, unsigned> +SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ST.hasSGPRInitBug() || - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) - return ScratchWaveOffsetReg; - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + // No replacement necessary. 
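// A minimal standalone sketch (not the LLVM code) of the 64-bit pointer add
// that emitFlatScratchInit builds above from S_ADD_U32/S_ADDC_U32: add the
// wave offset to the low half, then propagate the carry into the high half.
#include <cassert>
#include <cstdint>

static uint64_t addWaveOffset(uint32_t BaseLo, uint32_t BaseHi, uint32_t Offset) {
  uint32_t Lo = BaseLo + Offset;            // s_add_u32  (produces carry-out)
  uint32_t Carry = Lo < BaseLo ? 1u : 0u;
  uint32_t Hi = BaseHi + 0u + Carry;        // s_addc_u32 (consumes carry-in)
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  // The low half overflows, so the carry must reach the high half.
  assert(addWaveOffset(0xffffff00u, 0x00000001u, 0x200u) == 0x0000000200000100ull);
  return 0;
}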
+ if (ScratchWaveOffsetReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); + return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + } + + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (ST.hasSGPRInitBug()) + return std::make_pair(ScratchWaveOffsetReg, SPReg); unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI); + ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -163,33 +185,41 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // register from the list to consider, it means that when this // register is being used for the scratch wave offset and there // are no other free SGPRs, then the value will stay in this register. + // + 1 if stack pointer is used. // ---- - // 13 - if (AllSGPRs.size() < 13) - return ScratchWaveOffsetReg; + // 13 (+1) + unsigned ReservedRegCount = 13; + + if (AllSGPRs.size() < ReservedRegCount) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + + bool HandledScratchWaveOffsetReg = + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - for (MCPhysReg Reg : AllSGPRs.drop_back(13)) { + for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. - if (!MRI.isPhysRegUsed(Reg)) { - if (!MRI.isAllocatable(Reg) || - TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) - continue; + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + if (!HandledScratchWaveOffsetReg) { + HandledScratchWaveOffsetReg = true; - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - MFI->setScratchWaveOffsetReg(Reg); - return Reg; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + MFI->setScratchWaveOffsetReg(Reg); + ScratchWaveOffsetReg = Reg; + break; + } } } - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); } -void SIFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { +void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + auto AMDGPUASI = ST.getAMDGPUAS(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -207,18 +237,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); - unsigned ScratchWaveOffsetReg - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - if (ScratchRsrcReg == AMDGPU::NoRegister) { - assert(ScratchWaveOffsetReg == AMDGPU::NoRegister); - return; - } - - assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); - // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. @@ -228,22 +246,55 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // this point it appears we need the setup. 
This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) - emitFlatScratchInit(TII, TRI, MF, MBB); + if (MFI->hasFlatScratchInit()) + emitFlatScratchInit(ST, MF, MBB); + + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + + DebugLoc DL; + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); + + if (StackSize == 0) { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } + + unsigned ScratchRsrcReg + = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + + unsigned ScratchWaveOffsetReg; + std::tie(ScratchWaveOffsetReg, SPReg) + = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + + // It's possible to have uses of only ScratchWaveOffsetReg without + // ScratchRsrcReg if it's only used for the initialization of flat_scratch, + // but the inverse is not true. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { + assert(ScratchRsrcReg == AMDGPU::NoRegister); + return; + } // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg); - bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg); + bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchRsrcReg); // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. @@ -296,7 +347,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (OffsetRegUsed && PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, + MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); } if (CopyBuffer && !CopyBufferFirst) { @@ -314,21 +366,21 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // Use relocations to get the pointer, and setup the other bits manually. 
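// A minimal standalone sketch (not the LLVM code) of the stack-pointer setup
// above, which adds StackSize * wavefront size to the scratch wave offset and
// later subtracts it again. The reading assumed here is that the frame size
// is per-lane bytes while the SGPR offset covers the whole wave, hence the
// scaling by the 64 lanes of a GCN wavefront.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t WavefrontSize = 64;       // lanes per wave on these targets
  uint32_t ScratchWaveOffset = 0x1000;     // byte offset of this wave's scratch
  uint32_t FrameBytesPerLane = 32;         // MachineFrameInfo stack size

  uint32_t SP = ScratchWaveOffset + FrameBytesPerLane * WavefrontSize; // prologue
  assert(SP == 0x1800u);
  SP -= FrameBytesPerLane * WavefrontSize;                             // epilogue
  assert(SP == ScratchWaveOffset);
  return 0;
}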
uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); PointerType *PtrTy = PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()), - AMDGPUAS::CONSTANT_ADDRESS); + AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -336,7 +388,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) @@ -366,9 +418,89 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } } +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) { + emitEntryFunctionPrologue(MF, MBB); + return; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc DL; + + bool NeedFP = hasFP(MF); + if (NeedFP) { + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint32_t NumBytes = MFI.getStackSize(); + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + } +} + void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) + return; + + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + if (StackPtrReg == AMDGPU::NoRegister) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + + // FIXME: Clarify distinction between no set SP and SP. For callee functions, + // it's really whether we need SP to be accurate or not. 
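// A minimal standalone sketch (not the LLVM code) of the frame-pointer setup
// in the non-entry prologue above: the frame pointer is a copy of the stack
// pointer taken before the frame is allocated, so fixed locals keep a stable
// base even if the stack pointer later moves for variable-sized objects. All
// offsets below are made up for the demo.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t SP = 0x2000;               // incoming stack pointer
  uint32_t FP = SP;                   // prologue: copy SP into the frame pointer
  SP += 0x100;                        // allocate the fixed part of the frame

  uint32_t LocalAtOffset8 = FP + 8;   // locals are addressed FP-relative
  SP += 0x40;                         // later: a variable-sized allocation
  assert(LocalAtOffset8 == 0x2008u);  // unaffected by SP movement
  assert(SP == 0x2140u);
  return 0;
}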
+ if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } +} + +static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); + I != E; ++I) { + if (!MFI.isDeadObjectIndex(I)) + return false; + } + + return true; +} + +int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + + FrameReg = RI->getFrameRegister(MF); + return MF.getFrameInfo().getObjectOffset(FI); } void SIFrameLowering::processFunctionBeforeFrameFinalized( @@ -379,15 +511,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + bool AllSGPRSpilledToVGPRs = false; + + if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { + AllSGPRSpilledToVGPRs = true; + + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // XXX - This operates under the assumption that only other SGPR spills are + // users of the frame index. I'm not 100% sure this is correct. The + // StackColoring pass has a comment saying a future improvement would be to + // merging of allocas with spill slots, but for now according to + // MachineFrameInfo isSpillSlot can't alias any other object. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (TII->isSGPRSpill(MI)) { + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } else + AllSGPRSpilledToVGPRs = false; + } + } + } - assert((RS || !MayNeedScavengingEmergencySlot) && - "RegScavenger required if spilling"); + FuncInfo->removeSGPRToVGPRFrameIndices(MFI); + } - if (MayNeedScavengingEmergencySlot) { - int ScavengeFI = MFI.CreateStackObject( - AMDGPU::SGPR_32RegClass.getSize(), - AMDGPU::SGPR_32RegClass.getAlignment(), false); + // FIXME: The other checks should be redundant with allStackObjectsAreDead, + // but currently hasNonSpillStackObjects is set only from source + // allocas. Stack temps produced from legalization are not counted currently. + if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() || + !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) { + assert(RS && "RegScavenger required if spilling"); + + // We force this to be at offset 0 so no user object ever has 0 as an + // address, so we may use 0 as an invalid pointer value. This is because + // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca + // is required to be address space 0, we are forced to accept this for + // now. Ideally we could have the stack in another address space with 0 as a + // valid pointer, and -1 as the null value. 
+ // + // This will also waste additional space when user stack objects require > 4 + // byte alignment. + // + // The main cost here is losing the offset for addressing modes. However + // this also ensures we shouldn't need a register for the offset when + // emergency scavenging. + int ScavengeFI = MFI.CreateFixedObject( + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } } @@ -432,3 +615,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); } } + +bool SIFrameLowering::hasFP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + // TODO: Still want to eliminate sometimes. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // XXX - Is this only called after frame is finalized? Should be able to check + // frame size. + return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); +} + +bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasCalls() || MFI.hasVarSizedObjects(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 7657b4e..d4dfa1c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -26,18 +26,21 @@ public: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; + void emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; private: - void emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, + void emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const; @@ -48,7 +51,7 @@ private: SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - unsigned getReservedPrivateSegmentWaveByteOffsetReg( + std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, @@ -57,6 +60,10 @@ private: /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +public: + bool hasFP(const MachineFunction &MF) const override; + bool hasSP(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b98f9f4..2356405f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,26 +15,71 @@ #ifdef _MSC_VER // Provide M_PI. 
#define _USE_MATH_DEFINES -#include <cmath> #endif +#include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "SIDefines.h" -#include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/DAGCombine.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <cmath> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; @@ -43,7 +88,6 @@ static cl::opt<bool> EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); - static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -84,6 +128,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); } + if (Subtarget->hasVOP3PInsts()) { + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + } + computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -110,7 +159,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); @@ -142,10 +190,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 
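// A minimal standalone sketch (not the LLVM code) of what the v2i16/v2f16
// register classes added above mean in practice: two 16-bit lanes share one
// 32-bit register. The lane order (element 0 in the low half) is an
// assumption for the demo.
#include <cassert>
#include <cstdint>

static uint32_t pack(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}
static uint16_t lane(uint32_t Packed, unsigned Idx) {
  return static_cast<uint16_t>(Packed >> (Idx * 16));
}

int main() {
  uint32_t V = pack(0x1234, 0xabcd);
  assert(V == 0xabcd1234u);
  assert(lane(V, 0) == 0x1234 && lane(V, 1) == 0xabcd);
  return 0;
}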
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -153,9 +208,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::UADDO, MVT::i32, Legal); + setOperationAction(ISD::USUBO, MVT::i32, Legal); + + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, + MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -202,6 +264,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + // Avoid stack access for these. + // TODO: Generalize to more vector types. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); @@ -223,6 +292,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); setOperationAction(ISD::TRAP, MVT::Other, Custom); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -303,6 +373,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Custom); // F16 - VOP2 Actions. 
setOperationAction(ISD::BR_CC, MVT::f16, Expand); @@ -317,6 +388,96 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); } + if (Subtarget->hasVOP3PInsts()) { + for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch (Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + // XXX - Do these do anything? Vector constants turn into build_vector. + setOperationAction(ISD::Constant, MVT::v2i16, Legal); + setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + + setOperationAction(ISD::STORE, MVT::v2i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); + setOperationAction(ISD::STORE, MVT::v2f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v2f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::AND, MVT::v2i16, Promote); + AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); + setOperationAction(ISD::OR, MVT::v2i16, Promote); + AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); + setOperationAction(ISD::XOR, MVT::v2i16, Promote); + AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::ADD, MVT::v2i16, Legal); + setOperationAction(ISD::SUB, MVT::v2i16, Legal); + setOperationAction(ISD::MUL, MVT::v2i16, Legal); + setOperationAction(ISD::SHL, MVT::v2i16, Legal); + setOperationAction(ISD::SRL, MVT::v2i16, Legal); + setOperationAction(ISD::SRA, MVT::v2i16, Legal); + setOperationAction(ISD::SMIN, MVT::v2i16, Legal); + setOperationAction(ISD::UMIN, MVT::v2i16, Legal); + setOperationAction(ISD::SMAX, MVT::v2i16, Legal); + setOperationAction(ISD::UMAX, MVT::v2i16, Legal); + + setOperationAction(ISD::FADD, MVT::v2f16, Legal); + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + setOperationAction(ISD::FMUL, MVT::v2f16, Legal); + setOperationAction(ISD::FMA, MVT::v2f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); + + // This isn't really legal, but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + } else { + setOperationAction(ISD::SELECT, MVT::v2i16, Custom); + setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + } + + for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { + setOperationAction(ISD::SELECT, VT, Custom); + } + + setTargetDAGCombine(ISD::ADD); + 
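// A minimal standalone check (not the LLVM code) of the split made above,
// where AND/OR/XOR on v2i16 are promoted to i32 while ADD stays a packed
// operation: bitwise ops never cross the 16-bit lane boundary, so one 32-bit
// op is exact, whereas an add can carry out of the low lane into the high one.
#include <cassert>
#include <cstdint>

static uint32_t pack(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}

int main() {
  uint32_t A = pack(0xffff, 0x0000);
  uint32_t B = pack(0x0001, 0x0000);

  // Per-lane AND equals the 32-bit AND of the packed words.
  assert((A & B) == pack(0xffff & 0x0001, 0x0000 & 0x0000));

  // Per-lane ADD does not: the 32-bit add leaks the low-lane carry into the
  // high lane, while the true packed result keeps the high lane at zero.
  uint32_t PackedAdd = pack(static_cast<uint16_t>(0xffff + 0x0001), 0x0000);
  assert(A + B == 0x00010000u && PackedAdd == 0x00000000u);
  return 0;
}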
setTargetDAGCombine(ISD::ADDCARRY); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::SUBCARRY); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -332,6 +493,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FCANONICALIZE); + setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -364,36 +528,63 @@ const SISubtarget *SITargetLowering::getSubtarget() const { // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. + return false; +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, unsigned IntrID) const { switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_atomic_dec: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align = 0; - Info.vol = false; + + const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); + Info.vol = !Vol || !Vol->isZero(); Info.readMem = true; Info.writeMem = true; return true; + } default: return false; } } -bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, - EVT) const { - // SI has some legal vector types, but no legal vector operations. Say no - // shuffles are legal in order to prefer scalarizing some vector operations. - return false; +bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, + SmallVectorImpl<Value*> &Ops, + Type *&AccessTy) const { + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + Value *Ptr = II->getArgOperand(0); + AccessTy = II->getType(); + Ops.push_back(Ptr); + return true; + } + default: + return false; + } } bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { - // Flat instructions do not have offsets, and only have the register - // address. - return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); + if (!Subtarget->hasFlatInstOffsets()) { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && AM.Scale == 0; + } + + // GFX9 added a 13-bit signed offset. When using regular flat instructions, + // the sign bit is ignored and is treated as a 12-bit unsigned offset. + + // Just r + i + return isUInt<12>(AM.BaseOffs) && AM.Scale == 0; } bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { @@ -438,8 +629,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.BaseGV) return false; - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. 
@@ -454,8 +644,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } return isLegalMUBUFAddressingMode(AM); - } - case AMDGPUAS::CONSTANT_ADDRESS: { + } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -478,7 +667,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -492,13 +681,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } - case AMDGPUAS::PRIVATE_ADDRESS: + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { return isLegalMUBUFAddressingMode(AM); - - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS || + AS == AMDGPUASI.REGION_ADDRESS) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -513,21 +700,32 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } - case AMDGPUAS::FLAT_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + } else if (AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) { // For an unknown address space, this usually means that this is for some // reason being used for pure arithmetic, and not based on some addressing // computation. We don't have instructions that compute pointers with any // addressing modes, so treat them as having no offset like flat // instructions. return isLegalFlatAddressingMode(AM); - - default: + } else { llvm_unreachable("unhandled address space"); } } +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + return (MemVT.getSizeInBits() <= 4 * 32); + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); + return (MemVT.getSizeInBits() <= MaxPrivateBits); + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + return (MemVT.getSizeInBits() <= 2 * 32); + } + return true; +} + bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, @@ -544,8 +742,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return false; } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || - AddrSpace == AMDGPUAS::REGION_ADDRESS) { + if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS || + AddrSpace == AMDGPUASI.REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. @@ -560,8 +758,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. 
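// A minimal standalone sketch (not the LLVM code) of the addressing-mode
// checks above, which mostly reduce to "does the constant offset fit the
// instruction's immediate field": 12-bit unsigned for plain flat on GFX9,
// 16-bit unsigned for single-offset DS, and 20-bit byte offsets for VI SMEM.
// The helper mirrors llvm::isUInt<N> for the demo.
#include <cassert>
#include <cstdint>

template <unsigned N> static bool fitsUInt(int64_t V) {
  return V >= 0 && V < (int64_t(1) << N);
}

int main() {
  assert(fitsUInt<12>(4095) && !fitsUInt<12>(4096));        // GFX9 flat offset
  assert(fitsUInt<16>(65535) && !fitsUInt<16>(65536));      // DS immediate offset
  assert(fitsUInt<20>(1048575) && !fitsUInt<20>(1 << 20));  // VI SMEM byte offset
  return 0;
}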
if (!Subtarget->hasUnalignedScratchAccess() && - (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || - AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { + (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS || + AddrSpace == AMDGPUASI.FLAT_ADDRESS)) { return false; } @@ -569,7 +767,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? (Align % 4 == 0) : true; } @@ -609,15 +807,16 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } -static bool isFlatGlobalAddrSpace(unsigned AS) { - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; +static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { + return AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) && + isFlatGlobalAddrSpace(DestAS, AMDGPUASI); } bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { @@ -631,7 +830,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { // Flat -> private/local is a simple truncate. // Flat -> global is no-op - if (SrcAS == AMDGPUAS::FLAT_ADDRESS) + if (SrcAS == AMDGPUASI.FLAT_ADDRESS) return true; return isNoopAddrSpaceCast(SrcAS, DestAS); @@ -639,18 +838,8 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); - const Value *Ptr = MemNode->getMemOperand()->getValue(); - - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. 
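// A minimal standalone sketch (not the LLVM code) of the address-space cast
// rules above: casts between the flat-like spaces (global/flat/constant) are
// no-ops, and a cast out of flat into private or local is still cheap (a
// truncate). The enum values are made up for the demo; only the relationships
// matter.
#include <cassert>

enum AddrSpace { Global, Constant, Flat, Private, Local };

static bool isFlatGlobal(AddrSpace AS) {
  return AS == Global || AS == Flat || AS == Constant;
}
static bool isNoopCast(AddrSpace Src, AddrSpace Dst) {
  return isFlatGlobal(Src) && isFlatGlobal(Dst);
}
static bool isCheapCast(AddrSpace Src, AddrSpace Dst) {
  return Src == Flat || isNoopCast(Src, Dst); // flat -> private/local truncates
}

int main() {
  assert(isNoopCast(Global, Flat));
  assert(isCheapCast(Flat, Private) && !isNoopCast(Flat, Private));
  assert(!isCheapCast(Private, Flat));        // the other direction is not free
  return 0;
}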
- if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || - isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) - return true; - const Instruction *I = dyn_cast<Instruction>(Ptr); - return I && I->getMetadata("amdgpu.uniform"); + return AMDGPU::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -693,40 +882,28 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { return TargetLowering::isTypeDesirableForOp(Op, VT); } -SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, - const SDLoc &SL, SDValue Chain, - unsigned Offset) const { +SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + uint64_t Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, + SIRegisterInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed, +SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Val, + bool Signed, const ISD::InputArg *Arg) const { - const DataLayout &DL = DAG.getDataLayout(); - Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - - unsigned Align = DL.getABITypeAlignment(Ty); - - SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, - MachineMemOperand::MONonTemporal | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - - SDValue Val = Load; if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext : ISD::AssertSext; @@ -740,373 +917,545 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, else Val = DAG.getZExtOrTrunc(Val, SL, VT); - return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); + return Val; } -SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); +SDValue SITargetLowering::lowerKernargMemParameter( + SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + uint64_t Offset, bool Signed, + const ISD::InputArg *Arg) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + unsigned Align = DL.getABITypeAlignment(Ty); + + SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); + SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, + MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + + SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); +} + +SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const { MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); - if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA( - *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); - DAG.getContext()->diagnose(NoGraphicsHSA); - return DAG.getEntryNode(); + if (Arg.Flags.isByVal()) { + unsigned Size = Arg.Flags.getByValSize(); + int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); + return DAG.getFrameIndex(FrameIdx, MVT::i32); } - // Create stack objects that are used for emitting debugger prologue if - // "amdgpu-debugger-emit-prologue" attribute was specified. - if (ST.debuggerEmitPrologue()) - createDebuggerPrologueStackObjects(MF); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getValVT().getStoreSize(); - SmallVector<ISD::InputArg, 16> Splits; - BitVector Skipped(Ins.size()); + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); - for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { - const ISD::InputArg &Arg = Ins[i]; + // Create load nodes to retrieve arguments from the stack. 
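// A minimal standalone sketch (not the LLVM code) of the kernarg path above:
// lowerKernargMemParameter loads the argument's raw bytes from the kernarg
// segment and convertArgType widens them with a sign or zero extension. The
// 16-bit argument, its offset, and the little-endian byte layout are
// assumptions for the demo.
#include <cassert>
#include <cstdint>
#include <cstring>

static int32_t loadArgI16(const uint8_t *KernArg, uint64_t Offset, bool Signed) {
  uint16_t Raw;
  std::memcpy(&Raw, KernArg + Offset, sizeof(Raw));                 // the load
  return Signed ? static_cast<int32_t>(static_cast<int16_t>(Raw))   // sext
                : static_cast<int32_t>(Raw);                        // zext
}

int main() {
  uint8_t KernArg[8] = {0, 0, 0, 0, 0xfe, 0xff, 0, 0};  // i16 value -2 at offset 4
  assert(loadArgI16(KernArg, 4, /*Signed=*/true) == -2);
  assert(loadArgI16(KernArg, 4, /*Signed=*/false) == 0xfffe);
  return 0;
}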
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + SDValue ArgValue; + + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); - // First check if it's a PS input addr + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } + + ArgValue = DAG.getExtLoad( + ExtType, SL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT); + return ArgValue; +} + +static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, + CallingConv::ID CallConv, + ArrayRef<ISD::InputArg> Ins, + BitVector &Skipped, + FunctionType *FType, + SIMachineFunctionInfo *Info) { + for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { + const ISD::InputArg &Arg = Ins[I]; + + // First check if it's a PS input addr. if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { - // We can safely skip PS inputs - Skipped.set(i); + // We can safely skip PS inputs. + Skipped.set(I); ++PSInputNum; continue; } Info->markPSInputAllocated(PSInputNum); if (Arg.Used) - Info->PSInputEna |= 1 << PSInputNum; + Info->markPSInputEnabled(PSInputNum); ++PSInputNum; } - if (AMDGPU::isShader(CallConv)) { - // Second split vertices into their elements - if (Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - } else { - Splits.push_back(Arg); + // Second split vertices into their elements. + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned J = 0; J != NumElements; ++J) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); } + } else { + Splits.push_back(Arg); } } +} - SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); +// Allocate special inputs passed in VGPRs. +static void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasWorkItemIDX()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - // At least one interpolation mode must be enabled or else the GPU will hang. - // - // Check PSInputAddr instead of PSInputEna. 
The idea is that if the user set - // PSInputAddr, the user wants to enable some bits after the compilation - // based on run-time states. Since we can't know what the final PSInputEna - // will look like, so we shouldn't do anything here and the user should take - // responsibility for the correct programming. - // - // Otherwise, the following restrictions apply: - // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. - // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be - // enabled too. - if (CallConv == CallingConv::AMDGPU_PS && - ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - Info->markPSInputAllocated(0); - Info->PSInputEna |= 1; - } - - if (!AMDGPU::isShader(CallConv)) { - assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); - } else { - assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + if (Info.hasWorkItemIDY()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } - if (Info->hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasWorkItemIDZ()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } +} + +// Allocate special inputs passed in user SGPRs. +static void allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
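// A minimal standalone sketch (not the LLVM code) of the PS-input bookkeeping
// above: PSInputAddr and PSInputEna are bitmasks indexed by input number, and
// the hang condition quoted above -- no PERSP_* (0xF) or LINEAR_* (0x70) bit
// set, or POS_W (bit 11) without any PERSP_* bit -- forces input 0 on as a
// fallback.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t PSInputAddr = 0;           // inputs the shader declares
  uint32_t PSInputEna  = 0;           // inputs actually enabled

  auto markAllocated = [&](unsigned N) { PSInputAddr |= 1u << N; };
  auto markEnabled   = [&](unsigned N) { PSInputEna  |= 1u << N; };

  markAllocated(11);                  // only POS_W_FLOAT was requested

  bool NeedsFallback =
      (PSInputAddr & 0x7F) == 0 ||
      ((PSInputAddr & 0xF) == 0 && (PSInputAddr & (1u << 11)) != 0);
  if (NeedsFallback) {                // enable input 0 so the GPU doesn't hang
    markAllocated(0);
    markEnabled(0);
  }
  assert((PSInputEna & 1u) != 0);
  return 0;
}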
- if (Info->hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); - MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + if (Info.hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info->hasDispatchPtr()) { - unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + if (Info.hasDispatchPtr()) { + unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->hasQueuePtr()) { - unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + if (Info.hasQueuePtr()) { + unsigned QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info->hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + if (Info.hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(InputPtrReg); } - if (Info->hasDispatchID()) { - unsigned DispatchIDReg = Info->addDispatchID(*TRI); + if (Info.hasDispatchID()) { + unsigned DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info->hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + if (Info.hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } - if (!AMDGPU::isShader(CallConv)) - analyzeFormalArgumentsCompute(CCInfo, Ins); - else - AnalyzeFormalArguments(CCInfo, Splits); - - SmallVector<SDValue, 16> Chains; - - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { - - const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { - InVals.push_back(DAG.getUNDEF(Arg.VT)); - continue; - } - - CCValAssign &VA = ArgLocs[ArgIdx++]; - MVT VT = VA.getLocVT(); - - if (VA.isMemLoc()) { - VT = Ins[i].VT; - EVT MemVT = VA.getLocVT(); - const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + - VA.getLocMemOffset(); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt(), - &Ins[i]); - Chains.push_back(Arg.getValue(1)); - - auto *ParamTy = - dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && - ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // On SI local pointers are just offsets into LDS, so they are always - // less than 16-bits. On CI and newer they could potentially be - // real pointers, so we can't guarantee their size. 
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); - } - - InVals.push_back(Arg); - Info->setABIArgOffset(Offset + MemVT.getStoreSize()); - continue; - } - assert(VA.isRegLoc() && "Parameter must be in a register!"); - - unsigned Reg = VA.getLocReg(); - - if (VT == MVT::i64) { - // For now assume it is a pointer - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SGPR_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass); - SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); - InVals.push_back(Copy); - continue; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - - Reg = MF.addLiveIn(Reg, RC); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - - if (Arg.VT.isVector()) { - - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector<SDValue, 4> Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - - SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); - Regs.push_back(Copy); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); - continue; - } - - InVals.push_back(Val); - } - // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. +} - // Start adding system SGPRs. - if (Info->hasWorkGroupIDX()) { - unsigned Reg = Info->addWorkGroupIDX(); +// Allocate special input registers that are initialized per-wave. +static void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) { + if (Info.hasWorkGroupIDX()) { + unsigned Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupIDY()) { - unsigned Reg = Info->addWorkGroupIDY(); + if (Info.hasWorkGroupIDY()) { + unsigned Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupIDZ()) { - unsigned Reg = Info->addWorkGroupIDZ(); + if (Info.hasWorkGroupIDZ()) { + unsigned Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupInfo()) { - unsigned Reg = Info->addWorkGroupInfo(); + if (Info.hasWorkGroupInfo()) { + unsigned Reg = Info.addWorkGroupInfo(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasPrivateSegmentWaveByteOffset()) { + if (Info.hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. unsigned PrivateSegmentWaveByteOffsetReg; - if (AMDGPU::isShader(CallConv)) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + if (IsShader) { + PrivateSegmentWaveByteOffsetReg = + Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); + + // This is true if the scratch wave byte offset doesn't have a fixed + // location. 
+ if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } } else - PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); + PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } +} +static void reservePrivateMemoryRegs(const TargetMachine &TM, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool HasStackObjects = MFI.hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. if (HasStackObjects) - Info->setHasNonSpillStackObjects(true); + Info.setHasNonSpillStackObjects(true); // Everything live out of a block is spilled with fast regalloc, so it's // almost certain that spilling will be required. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + if (TM.getOptLevel() == CodeGenOpt::None) HasStackObjects = true; + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); - Info->setScratchRSrcReg(PrivateSegmentBufferReg); + Info.setScratchRSrcReg(PrivateSegmentBufferReg); - unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); } else { unsigned ReservedBufferReg - = TRI->reservedPrivateSegmentBufferReg(MF); + = TRI.reservedPrivateSegmentBufferReg(MF); unsigned ReservedOffsetReg - = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); // We tentatively reserve the last registers (skipping the last two // which may contain VCC). After register allocation, we'll replace // these with the ones immediately after those which were really // allocated. In the prologue copies will be inserted from the argument // to these reserved registers. - Info->setScratchRSrcReg(ReservedBufferReg); - Info->setScratchWaveOffsetReg(ReservedOffsetReg); + Info.setScratchRSrcReg(ReservedBufferReg); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } else { - unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); // Without HSA, relocations are used for the scratch pointer and the // buffer resource setup is always inserted in the prologue. Scratch wave // offset is still in an input SGPR. 
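A condensed view of the choice reservePrivateMemoryRegs makes, here and in the hunk that follows, about where the scratch resource descriptor and scratch wave offset come from. This is a plain C++ summary sketch; the struct and flag names are mine, not LLVM's:

    // true means "take it from the preloaded input SGPRs"; false means "use the
    // tentatively reserved high SGPRs / prologue setup and patch after RA".
    struct ScratchRegSources {
      bool rsrcFromUserSGPRs;
      bool waveOffsetFromInputSGPR;
    };

    ScratchRegSources pickScratchSources(bool isAmdCodeObjectV2,
                                         bool hasStackObjects) {
      if (isAmdCodeObjectV2) {
        // With stack objects the first user SGPRs already hold the buffer and
        // wave offset; otherwise reserve high SGPRs (skipping the pair that may
        // hold VCC) and replace them after register allocation.
        return { hasStackObjects, hasStackObjects };
      }
      // Non-HSA: the buffer resource is always built in the prologue via
      // relocations; only the wave byte offset may still come from an input SGPR.
      return { false, hasStackObjects };
    }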
- Info->setScratchRSrcReg(ReservedBufferReg); + Info.setScratchRSrcReg(ReservedBufferReg); if (HasStackObjects) { - unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { unsigned ReservedOffsetReg - = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - Info->setScratchWaveOffsetReg(ReservedOffsetReg); + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } +} - if (Info->hasWorkItemIDX()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); + DAG.getContext()->diagnose(NoGraphicsHSA); + return DAG.getEntryNode(); } - if (Info->hasWorkItemIDY()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); + + SmallVector<ISD::InputArg, 16> Splits; + SmallVector<CCValAssign, 16> ArgLocs; + BitVector Skipped(Ins.size()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + bool IsShader = AMDGPU::isShader(CallConv); + bool IsKernel = AMDGPU::isKernel(CallConv); + bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); + + if (IsShader) { + processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); + + // At least one interpolation mode must be enabled or else the GPU will + // hang. + // + // Check PSInputAddr instead of PSInputEnable. The idea is that if the user + // set PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. 
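The restrictions listed in the comment above are enforced by the PSInputAddr test in the next hunk. A minimal sketch of that predicate in plain C++ (per the comment, bits 0-3 are the PERSP_* enables, bits 4-6 the LINEAR_* enables, and bit 11 is POS_W_FLOAT):

    #include <cassert>
    #include <cstdint>

    // Returns true when the compiler must force-enable an interpolation mode
    // itself (allocate VGPR0/VGPR1 and mark PS input 0 enabled) to keep the
    // GPU from hanging.
    bool mustForceEnableInterp(uint32_t psInputAddr, bool posWFloatAllocated) {
      return (psInputAddr & 0x7F) == 0 ||                       // no PERSP_* and no LINEAR_*
             ((psInputAddr & 0xF) == 0 && posWFloatAllocated);  // POS_W_FLOAT without PERSP_*
    }

    int main() {
      assert(mustForceEnableInterp(0x000, false));   // nothing enabled
      assert(mustForceEnableInterp(0x800, true));    // only POS_W_FLOAT (bit 11)
      assert(!mustForceEnableInterp(0x001, false));  // a PERSP_* mode is enabled
      return 0;
    }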
+ if (CallConv == CallingConv::AMDGPU_PS && + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } + + assert(!Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); + } else if (IsKernel) { + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + Splits.append(Ins.begin(), Ins.end()); } - if (Info->hasWorkItemIDZ()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); + if (IsEntryFunc) { + allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); } - if (Chains.empty()) - return Chain; + if (IsKernel) { + analyzeFormalArgumentsCompute(CCInfo, Ins); + } else { + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); + CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + } + + SmallVector<SDValue, 16> Chains; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + if (Skipped[i]) { + InVals.push_back(DAG.getUNDEF(Arg.VT)); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + MVT VT = VA.getLocVT(); + + if (IsEntryFunc && VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = VA.getLocVT(); + + const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + + VA.getLocMemOffset(); + Info->setABIArgOffset(Offset + MemVT.getStoreSize()); + + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + SDValue Arg = lowerKernargMemParameter( + DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); + Chains.push_back(Arg.getValue(1)); + + auto *ParamTy = + dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. + Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + + InVals.push_back(Arg); + continue; + } else if (!IsEntryFunc && VA.isMemLoc()) { + SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); + InVals.push_back(Val); + if (!Arg.Flags.isByVal()) + Chains.push_back(Val.getValue(1)); + continue; + } + + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
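The switch in the next hunk recovers 8- and 16-bit arguments that were passed promoted in a 32-bit register: AssertSext/AssertZext records what the ABI guarantees about the upper bits, then a TRUNCATE restores the declared type. The same idea in plain scalar C++, as a sketch rather than the DAG form:

    #include <cassert>
    #include <cstdint>

    // An i16 argument arrives in a 32-bit register. Under SExt the caller
    // sign-extended it; under ZExt it zero-extended it. Either way the callee
    // only needs to truncate to get the declared 16-bit value back.
    int16_t recoverSExtArg(uint32_t reg32) {
      // AssertSext i16: bits 31..16 are copies of bit 15 (guaranteed, not checked).
      return static_cast<int16_t>(reg32);       // TRUNCATE
    }

    uint16_t recoverZExtArg(uint32_t reg32) {
      // AssertZext i16: bits 31..16 are zero.
      return static_cast<uint16_t>(reg32);      // TRUNCATE
    }

    int main() {
      assert(recoverSExtArg(0xFFFFFFF5u) == -11);     // caller sign-extended -11
      assert(recoverZExtArg(0x0000ABCDu) == 0xABCD);
      return 0;
    }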
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + + if (IsShader && Arg.VT.isVector()) { + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector<SDValue, 4> Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + + SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); + Regs.push_back(Copy); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + Regs.append(NumElements, DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); + continue; + } + + InVals.push_back(Val); + } + + // Start adding system SGPRs. + if (IsEntryFunc) { + allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); + } else { + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); + CCInfo.AllocateReg(Info->getFrameOffsetReg()); + } + + return Chains.empty() ? Chain : + DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +} + +// TODO: If return values can't fit in registers, we should return as many as +// possible in registers before passing on stack. +bool SITargetLowering::CanLowerReturn( + CallingConv::ID CallConv, + MachineFunction &MF, bool IsVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + // Replacing returns with sret/stack usage doesn't make sense for shaders. + // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn + // for shaders. Vector types should be explicitly handled by CC. 
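Back in the argument loop above, a shader vector argument that was split by its original element count is rebuilt from that many registers, and the remaining lanes of the value type are filled with undef. A small sketch of that bookkeeping (plain C++; the struct is illustrative only):

    #include <cassert>

    // Registers consumed and undef lanes appended when rebuilding a shader
    // vector argument whose original IR type had fewer elements than Arg.VT.
    struct RebuiltVector {
      unsigned regsRead;
      unsigned undefLanes;
    };

    RebuiltVector rebuildShaderVector(unsigned origNumElements,
                                      unsigned valueTypeNumElements) {
      return { origNumElements, valueTypeNumElements - origNumElements };
    }

    int main() {
      RebuiltVector v = rebuildShaderVector(3, 4);  // e.g. a v3f32 param seen as v4f32
      assert(v.regsRead == 3 && v.undefLanes == 1);
      return 0;
    }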
+ if (AMDGPU::isEntryFunctionCC(CallConv)) + return true; + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); + return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)); } SDValue @@ -1118,11 +1467,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (!AMDGPU::isShader(CallConv)) + if (AMDGPU::isKernel(CallConv)) { return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); + } + + bool IsShader = AMDGPU::isShader(CallConv); Info->setIfReturnsVoid(Outs.size() == 0); + bool IsWaveEnd = Info->returnsVoid() && IsShader; SmallVector<ISD::OutputArg, 48> Splits; SmallVector<SDValue, 48> SplitVals; @@ -1131,7 +1484,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { const ISD::OutputArg &Out = Outs[i]; - if (Out.VT.isVector()) { + if (IsShader && Out.VT.isVector()) { MVT VT = Out.VT.getVectorElementType(); ISD::OutputArg NewOut = Out; NewOut.Flags.setSplit(); @@ -1162,29 +1515,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, *DAG.getContext()); // Analyze outgoing return values. - AnalyzeReturn(CCInfo, Splits); + CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg)); SDValue Flag; SmallVector<SDValue, 48> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Add return address for callable functions. + if (!Info->isEntryFunction()) { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + SDValue ReturnAddrReg = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); + + // FIXME: Should be able to use a vreg here, but need a way to prevent it + // from being allcoated to a CSR. + + SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), + MVT::i64); + + Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag); + Flag = Chain.getValue(1); + + RetOps.push_back(PhysReturnAddrReg); + } + // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); + // TODO: Partially return in registers if return values don't fit. SDValue Arg = SplitVals[realRVLocIdx]; // Copied from other backends. switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + default: + llvm_unreachable("Unknown loc info!"); } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); @@ -1192,12 +1574,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + // FIXME: Does sret work properly? + // Update chain and glue. RetOps[0] = Chain; if (Flag.getNode()) RetOps.push_back(Flag); - unsigned Opc = Info->returnsVoid() ? 
AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; + unsigned Opc = AMDGPUISD::ENDPGM; + if (!IsWaveEnd) + Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -1436,7 +1822,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset) { - int NumElts = SuperRC->getSize() / 4; + int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; // Skip out of bounds offsets, or else we would end up using an undefined // register. @@ -1470,16 +1856,16 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; if (Offset == 0) { MachineInstr *SetOn = - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addOperand(*Idx) - .addImm(IdxMode); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .add(*Idx) + .addImm(IdxMode); SetOn->getOperand(3).setIsUndef(); } else { unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) - .addOperand(*Idx) - .addImm(Offset); + .add(*Idx) + .addImm(Offset); MachineInstr *SetOn = BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) .addReg(Tmp, RegState::Kill) @@ -1493,10 +1879,10 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, if (Offset == 0) { BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(*Idx); + .add(*Idx); } else { BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addOperand(*Idx) + .add(*Idx) .addImm(Offset); } @@ -1522,7 +1908,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); - bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { MachineBasicBlock::iterator I(&MI); @@ -1548,7 +1934,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return &MBB; } - const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); @@ -1586,17 +1971,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { - switch (VecRC->getSize()) { - case 4: +static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, + const TargetRegisterClass *VecRC) { + switch (TRI.getRegSizeInBits(*VecRC)) { + case 32: // 4 bytes return AMDGPU::V_MOVRELD_B32_V1; - case 8: + case 64: // 8 bytes return AMDGPU::V_MOVRELD_B32_V2; - case 16: + case 128: // 16 bytes return AMDGPU::V_MOVRELD_B32_V4; - case 32: + case 256: // 32 bytes return AMDGPU::V_MOVRELD_B32_V8; - case 64: + case 512: // 64 bytes return AMDGPU::V_MOVRELD_B32_V16; default: llvm_unreachable("unsupported size for MOVRELD pseudos"); @@ -1625,7 +2011,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); - bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (Idx->getReg() == AMDGPU::NoRegister) { MachineBasicBlock::iterator I(&MI); @@ -1634,9 +2020,9 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, assert(Offset == 0); BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) - .addOperand(*SrcVec) - .addOperand(*Val) - .addImm(SubReg); + .add(*SrcVec) + .add(*Val) + 
.addImm(SubReg); MI.eraseFromParent(); return &MBB; @@ -1648,20 +2034,20 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, if (UseGPRIdxMode) { BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst - .addOperand(*Val) - .addReg(Dst, RegState::ImplicitDefine) - .addReg(SrcVec->getReg(), RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst + .add(*Val) + .addReg(Dst, RegState::ImplicitDefine) + .addReg(SrcVec->getReg(), RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(MBB, I, DL, MovRelDesc) .addReg(Dst, RegState::Define) .addReg(SrcVec->getReg()) - .addOperand(*Val) + .add(*Val) .addImm(SubReg - AMDGPU::sub0); } @@ -1694,18 +2080,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, if (UseGPRIdxMode) { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(PhiReg, RegState::Undef, SubReg) // vdst - .addOperand(*Val) // src0 - .addReg(Dst, RegState::ImplicitDefine) - .addReg(PhiReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(PhiReg, RegState::Undef, SubReg) // vdst + .add(*Val) // src0 + .addReg(Dst, RegState::ImplicitDefine) + .addReg(PhiReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc) .addReg(Dst, RegState::Define) .addReg(PhiReg) - .addOperand(*Val) + .add(*Val) .addImm(SubReg - AMDGPU::sub0); } @@ -1741,18 +2127,76 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } switch (MI.getOpcode()) { - case AMDGPU::SI_INIT_M0: { + case AMDGPU::SI_INIT_M0: BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC: + // This should be before all vector instructions. + BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + MachineInstr *FirstMI = &*BB->begin(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned InputReg = MI.getOperand(0).getReg(); + unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + bool Found = false; + + // Move the COPY of the input reg to the beginning, so that we can use it. + for (auto I = BB->begin(); I != &MI; I++) { + if (I->getOpcode() != TargetOpcode::COPY || + I->getOperand(0).getReg() != InputReg) + continue; + + if (I == FirstMI) { + FirstMI = &*++BB->begin(); + } else { + I->removeFromParent(); + BB->insert(FirstMI, &*I); + } + Found = true; + break; + } + assert(Found); + (void)Found; + + // This should be before all vector instructions. 
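For SI_INIT_EXEC_FROM_INPUT, the comment above spells out the S_BFE_U32 / S_BFM_B64 / S_CMP_EQ_U32 / S_CMOV_B64 sequence emitted in the next hunk; the immediate (imm & 0x7f) | 0x70000 supplies the field offset in its low bits and, as I read it, a field width of 7 for the extract. A plain C++ sketch of the EXEC mask that sequence produces:

    #include <cassert>
    #include <cstdint>

    // count = (input >> shift) & 0x7f        ; S_BFE_U32, 7-bit thread count
    // exec  = (1 << count) - 1               ; S_BFM_B64 count, 0
    // if (count == 64) exec = ~0             ; S_CMP_EQ_U32 + S_CMOV_B64 -1
    // BFM cannot produce the all-ones 64-bit mask by itself, hence the compare.
    uint64_t execMaskFromInput(uint32_t input, unsigned shift) {
      uint32_t count = (input >> shift) & 0x7f;
      if (count >= 64)        // >= keeps the C++ shift defined; hardware compares == 64
        return ~0ull;
      return (1ull << count) - 1;
    }

    int main() {
      assert(execMaskFromInput(13, 0) == 0x1FFF);  // 13 active lanes
      assert(execMaskFromInput(64, 0) == ~0ull);   // full wave
      return 0;
    }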
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), + AMDGPU::EXEC) + .addReg(CountReg) + .addImm(0); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(64); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), + AMDGPU::EXEC) + .addImm(-1); MI.eraseFromParent(); return BB; } + case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->getLDSSize()); + .add(MI.getOperand(0)) + .addImm(MFI->getLDSSize()); MI.eraseFromParent(); return BB; } @@ -1803,7 +2247,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); Br->getOperand(1).setIsUndef(true); // read undef SCC MI.eraseFromParent(); return BB; @@ -1856,9 +2300,6 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); - if (!VT.isSimple()) - return false; - switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: // This is as fast on some subtargets. However, we always have full rate f32 @@ -1909,13 +2350,74 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); - case ISD::TRAP: return lowerTRAP(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return lowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + + case ISD::TRAP: + case ISD::DEBUGTRAP: + return lowerTRAP(Op, DAG); } return SDValue(); } +void SITargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::INSERT_VECTOR_ELT: { + if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) + Results.push_back(Res); + return; + } + case ISD::EXTRACT_VECTOR_ELT: { + if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) + Results.push_back(Res); + return; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + if (IID == Intrinsic::amdgcn_cvt_pkrtz) { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, + Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); + return; + } + break; + } + case ISD::SELECT: { + SDLoc SL(N); + EVT VT = N->getValueType(0); + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); + SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); + + EVT SelectVT = NewVT; + if (NewVT.bitsLT(MVT::i32)) { + LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); + RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); + SelectVT = MVT::i32; + } + + SDValue NewSelect = DAG.getNode(ISD::SELECT, 
SL, SelectVT, + N->getOperand(0), LHS, RHS); + + if (NewVT != SelectVT) + NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); + return; + } + default: + break; + } +} + /// \brief Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { @@ -1932,31 +2434,25 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { return nullptr; } -bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { +unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { - case AMDGPUIntrinsic::amdgcn_if: - case AMDGPUIntrinsic::amdgcn_else: - case AMDGPUIntrinsic::amdgcn_end_cf: - case AMDGPUIntrinsic::amdgcn_loop: - return true; + case Intrinsic::amdgcn_if: + return AMDGPUISD::IF; + case Intrinsic::amdgcn_else: + return AMDGPUISD::ELSE; + case Intrinsic::amdgcn_loop: + return AMDGPUISD::LOOP; + case Intrinsic::amdgcn_end_cf: + llvm_unreachable("should not occur"); default: - return false; + return 0; } } - if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::amdgcn_break: - case AMDGPUIntrinsic::amdgcn_if_break: - case AMDGPUIntrinsic::amdgcn_else_break: - return true; - default: - return false; - } - } - - return false; + // break, if_break, else_break are all only used as inputs to loop, not + // directly as branch conditions. + return 0; } void SITargetLowering::createDebuggerPrologueStackObjects( @@ -1987,13 +2483,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -2006,7 +2502,6 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { - SDLoc DL(BRCOND); SDNode *Intr = BRCOND.getOperand(1).getNode(); @@ -2032,7 +2527,8 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> - if (!isCFIntrinsic(Intr)) { + unsigned CFNode = isCFIntrinsic(Intr); + if (CFNode == 0) { // This is a uniform branch so we don't need to legalize. return BRCOND; } @@ -2050,15 +2546,13 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, if (HaveChain) Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end()); + Ops.append(Intr->op_begin() + (HaveChain ? 
2 : 1), Intr->op_end()); Ops.push_back(Target); ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); // build the new intrinsic call - SDNode *Result = DAG.getNode( - Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res), Ops).getNode(); + SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); if (!HaveChain) { SDValue Ops[] = { @@ -2127,12 +2621,82 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); - return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; + return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); +} + +SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + + unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? + SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; + + if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && + Subtarget->isTrapHandlerEnabled()) { + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(TrapID, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); + } + + switch (TrapID) { + case SISubtarget::TrapIDLLVMTrap: + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); + case SISubtarget::TrapIDLLVMDebugTrap: { + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "debugtrap handler not supported", + Op.getDebugLoc(), + DS_Warning); + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.diagnose(NoTrap); + return Chain; + } + default: + llvm_unreachable("unsupported trap handler type!"); + } + + return Chain; } -SDValue SITargetLowering::getSegmentAperture(unsigned AS, +SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { - SDLoc SL; + // FIXME: Use inline constants (src_{shared, private}_base) instead. + if (Subtarget->hasApertureRegs()) { + unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? 
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); + SDValue ApertureReg = SDValue( + DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); + SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); + return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); unsigned UserSGPR = Info->getQueuePtrUserSGPR(); @@ -2143,19 +2707,19 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. - uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, - DAG.getConstant(StructOffset, SL, MVT::i64)); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, DL, MVT::i64)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), - AMDGPUAS::CONSTANT_ADDRESS)); + AMDGPUASI.CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); - return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2167,15 +2731,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); SDValue Src = ASC->getOperand(0); - - // FIXME: Really support non-0 null pointers. 
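Relating to getSegmentAperture above: on subtargets with aperture registers, the code packs a hardware-register id, a field offset and a width-minus-one into the S_GETREG_B32 immediate, then shifts the returned field up by the field width to form the 32-bit aperture base. A worked sketch of that packing in plain C++; the field positions (id in bits 5:0, offset in bits 10:6, width-1 in bits 15:11) are my reading of the SIMM16 layout and should be treated as an assumption:

    #include <cstdint>

    // Assumed SIMM16 layout for S_GETREG_B32, mirroring
    // ID << ID_SHIFT_ | Offset << OFFSET_SHIFT_ | WidthM1 << WIDTH_M1_SHIFT_.
    uint16_t encodeGetRegImm(unsigned id, unsigned offset, unsigned widthM1) {
      return static_cast<uint16_t>((id & 0x3f) |
                                   ((offset & 0x1f) << 6) |
                                   ((widthM1 & 0x1f) << 11));
    }

    // The extracted field holds the aperture base's high bits; shifting left by
    // widthM1 + 1 moves them into the top of the 32-bit aperture value (a
    // 16-bit field ends up in bits 31:16), matching the ISD::SHL above.
    uint32_t apertureFromGetReg(uint32_t getregResult, unsigned widthM1) {
      return getregResult << (widthM1 + 1);
    }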
- SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + const AMDGPUTargetMachine &TM = + static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); + // flat -> local/private - if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + unsigned DestAS = ASC->getDestAddressSpace(); + + if (DestAS == AMDGPUASI.LOCAL_ADDRESS || + DestAS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned NullVal = TM.getNullPointerValue(DestAS); + SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); @@ -2185,13 +2753,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, } // local/private -> flat - if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + unsigned SrcAS = ASC->getSrcAddressSpace(); + + if (SrcAS == AMDGPUASI.LOCAL_ADDRESS || + SrcAS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned NullVal = TM.getNullPointerValue(SrcAS); + SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); SDValue CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); @@ -2211,17 +2784,97 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, return DAG.getUNDEF(ASC->getValueType(0)); } +SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Idx = Op.getOperand(2); + if (isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Avoid stack access for dynamic indexing. + SDLoc SL(Op); + SDValue Vec = Op.getOperand(0); + SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); + + // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec + SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); + + // Convert vector index to bit-index. + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, + DAG.getConstant(16, SL, MVT::i32)); + + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + + SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, + DAG.getConstant(0xffff, SL, MVT::i32), + ScaledIdx); + + SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, + DAG.getNOT(SL, BFM, MVT::i32), BCVec); + + SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); +} + +SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + + EVT ResultVT = Op.getValueType(); + SDValue Vec = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + + // Make sure we we do any optimizations that will make it easier to fold + // source modifiers before obscuring it with bit operations. + + // XXX - Why doesn't this get called when vector_shuffle is expanded? 
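lowerINSERT_VECTOR_ELT above avoids a stack slot for a dynamic index into a v2f16/v2i16 value by working on its 32-bit bit pattern: build a 16-bit-wide mask over the selected half (the v_bfm step) and merge the new element with the old vector through it (the v_bfi step). A standalone C++ sketch of that bit math; it models the intent of the comment (the new element is shifted into the selected half before merging) rather than mirroring the exact SDNode sequence:

    #include <cassert>
    #include <cstdint>

    // vec32: the v2i16/v2f16 vector bitcast to i32 (element 0 in the low half).
    // elt16: the new element's 16-bit pattern. idx: 0 or 1, not known statically.
    uint32_t insertHalfDynamic(uint32_t vec32, uint16_t elt16, uint32_t idx) {
      uint32_t bitIndex = idx * 16;                        // vector index -> bit index
      uint32_t mask = 0xffffu << bitIndex;                 // v_bfm_b32-style half mask
      uint32_t positioned = uint32_t(elt16) << bitIndex;   // element moved into that half
      return (mask & positioned) | (~mask & vec32);        // v_bfi_b32-style merge
    }

    int main() {
      assert(insertHalfDynamic(0xAAAA5555u, 0x1234, 0) == 0xAAAA1234u);
      assert(insertHalfDynamic(0xAAAA5555u, 0x1234, 1) == 0x12345555u);
      return 0;
    }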
+ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) + return Combined; + + if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { + SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + + if (CIdx->getZExtValue() == 1) { + Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result, + DAG.getConstant(16, SL, MVT::i32)); + } else { + assert(CIdx->getZExtValue() == 0); + } + + if (ResultVT.bitsLT(MVT::i32)) + Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + } + + SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32); + + // Convert vector index to bit-index. + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + + SDValue Result = Elt; + if (ResultVT.bitsLT(MVT::i32)) + Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + + return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); +} + bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. - return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitGOTReloc(GA->getGlobal()); } -static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, - SDLoc DL, unsigned Offset, EVT PtrVT, - unsigned GAFlags = SIInstrInfo::MO_NONE) { +static SDValue +buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + const SDLoc &DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: // @@ -2265,8 +2918,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); @@ -2283,7 +2936,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SIInstrInfo::MO_GOTPCREL32); Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); unsigned Align = DataLayout.getABITypeAlignment(PtrTy); // FIXME: Use a PseudoSourceValue once those can be assigned an address space. @@ -2294,23 +2947,6 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachineMemOperand::MOInvariant); } -SDValue SITargetLowering::lowerTRAP(SDValue Op, - SelectionDAG &DAG) const { - const MachineFunction &MF = DAG.getMachineFunction(); - DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), - "trap handler not supported", - Op.getDebugLoc(), - DS_Warning); - DAG.getContext()->diagnose(NoTrap); - - // Emit s_endpgm. - - // FIXME: This should really be selected to s_trap, but that requires - // setting up the trap handler for it o do anything. 
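The companion lowerEXTRACT_VECTOR_ELT above selects one half of the packed 32-bit value by shifting it down by the element's bit index and truncating to 16 bits; a constant index of 1 becomes a plain shift by 16 and index 0 needs no shift at all. A tiny standalone sketch of that arithmetic:

    #include <cassert>
    #include <cstdint>

    uint16_t extractHalfDynamic(uint32_t vec32, uint32_t idx) {
      uint32_t bitIndex = idx * 16;                      // vector index -> bit index
      return static_cast<uint16_t>(vec32 >> bitIndex);   // SRL + TRUNCATE to i16
    }

    int main() {
      assert(extractHalfDynamic(0x12345678u, 0) == 0x5678);
      assert(extractHalfDynamic(0x12345678u, 1) == 0x1234);
      return 0;
    }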
- return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, - Op.getOperand(0)); -} - SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const { // We can't use S_MOV_B32 directly, because there is no way to specify m0 as @@ -2332,14 +2968,15 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, MVT VT, unsigned Offset) const { SDLoc SL(Op); - SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); // The local size values will have the hi 16-bits as zero. return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); } -static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { +static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, + EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), "non-hsa intrinsic with hsa target", DL.getDebugLoc()); @@ -2347,7 +2984,8 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { return DAG.getUNDEF(VT); } -static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { +static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, + EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), "intrinsic not supported on subtarget", DL.getDebugLoc()); @@ -2369,7 +3007,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: @@ -2389,7 +3031,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_implicitarg_ptr: { unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); - return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset); } case Intrinsic::amdgcn_kernarg_segment_ptr: { unsigned Reg @@ -2403,19 +3045,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq: - case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - case Intrinsic::amdgcn_rsq_legacy: { + case Intrinsic::amdgcn_rsq_legacy: if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - } - case Intrinsic::amdgcn_rcp_legacy: { + case Intrinsic::amdgcn_rcp_legacy: if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); - } case Intrinsic::amdgcn_rsq_clamp: { if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); @@ -2434,38 +3073,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Subtarget->isAmdHsaOS()) return 
emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -2522,43 +3161,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - } - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::SI_fs_constant: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(1), Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_packf16: - if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) - return DAG.getUNDEF(MVT::i32); - return Op; - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I); - J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, 
MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); - } case Intrinsic::amdgcn_interp_mov: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); SDValue Glue = M0.getValue(1); @@ -2639,10 +3243,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_icmp: { const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - int CondCode = CD->getSExtValue(); + if (!CD) + return DAG.getUNDEF(VT); + int CondCode = CD->getSExtValue(); if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || - CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) + CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) return DAG.getUNDEF(VT); ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); @@ -2652,10 +3258,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fcmp: { const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - int CondCode = CD->getSExtValue(); + if (!CD) + return DAG.getUNDEF(VT); - if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || - CondCode >= FCmpInst::Predicate::FCMP_TRUE) + int CondCode = CD->getSExtValue(); + if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || + CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) return DAG.getUNDEF(VT); FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); @@ -2663,14 +3271,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), Op.getOperand(2), DAG.getCondCode(CCOpcode)); } + case Intrinsic::amdgcn_fmed3: + return DAG.getNode(AMDGPUISD::FMED3, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_sffbh: - case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_sbfe: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_ubfe: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_cvt_pkrtz: { + // FIXME: Stop adding cast if v2f16 legal. + EVT VT = Op.getValueType(); + SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::BITCAST, DL, VT, Node); + } default: - return AMDGPUTargetLowering::LowerOperation(Op, DAG); + return Op; } } @@ -2678,6 +3301,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -2703,7 +3328,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
@@ -2718,6 +3342,87 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + case Intrinsic::amdgcn_image_getlod: { + // Replace dmask with everything disabled with undef. 
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); + if (!DMask || DMask->isNullValue()) { + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); + } + + return SDValue(); + } default: return SDValue(); } @@ -2725,51 +3430,75 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: - case Intrinsic::amdgcn_s_sendmsg: { - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); + case Intrinsic::amdgcn_exp: { + const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); + const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8)); + const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9)); + + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + Op.getOperand(4), // src0 + Op.getOperand(5), // src1 + Op.getOperand(6), // src2 + Op.getOperand(7), // src3 + DAG.getTargetConstant(0, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + } + case Intrinsic::amdgcn_exp_compr: { + const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); + const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); + SDValue Src0 = Op.getOperand(4); + SDValue Src1 = Op.getOperand(5); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); + const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7)); + + SDValue Undef = DAG.getUNDEF(MVT::f32); + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), + Undef, // src2 + Undef, // src3 + DAG.getTargetConstant(1, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } + case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { + unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? 
+ AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain, + return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + case Intrinsic::amdgcn_init_exec: { + return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, + Op.getOperand(2)); + } + case Intrinsic::amdgcn_init_exec_from_input: { + return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, + Op.getOperand(2), Op.getOperand(3)); } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); @@ -2784,31 +3513,87 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); } - case AMDGPUIntrinsic::SI_export: { - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3)); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4)); - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5)); - const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6)); + case Intrinsic::amdgcn_s_barrier: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; + if (WGSize <= ST.getWavefrontSize()) + return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, + Op.getOperand(0)), 0); + } + return SDValue(); + }; + case AMDGPUIntrinsic::SI_tbuffer_store: { - const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), - DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), - Op.getOperand(7), // src0 - Op.getOperand(8), // src1 - Op.getOperand(9), // src2 - Op.getOperand(10) // src3 + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc }; - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); } + default: - return SDValue(); + return Op; } } @@ -2857,36 +3642,36 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUASI.FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - switch (AS) { - case AMDGPUAS::CONSTANT_ADDRESS: + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { if (isMemOpUniform(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // - LLVM_FALLTHROUGH; - case AMDGPUAS::GLOBAL_ADDRESS: { + } + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - isMemOpHasNoClobberedMemOperand(Load)) + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - LLVM_FALLTHROUGH; - case AMDGPUAS::FLAT_ADDRESS: + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); // v4 loads are supported for private and global memory. return SDValue(); - case AMDGPUAS::PRIVATE_ADDRESS: { + } + if (AS == AMDGPUASI.PRIVATE_ADDRESS) { // Depending on the setting of the private_element_size field in the // resource descriptor, we can only make private accesses up to a certain // size. 
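The LowerLOAD hunk above (and the matching LowerSTORE change further down) rewrites the address-space dispatch from a switch over the old AMDGPUAS enumerators to comparisons against AMDGPUASI, but the splitting policy itself is unchanged: a vector access is kept whole only if the selected instruction can cover it. A minimal sketch of that policy as read from the hunk — the helper name is mine, it is not part of the patch, and private-address handling (which depends on max_private_element_size) is deliberately left out:

```cpp
// Illustrative helper, not code from the patch: the widest vector load that
// LowerLOAD keeps intact per address space.  Global/constant/flat loads can
// select to dwordx4 MUBUF/SMEM forms, while LDS loads top out at ds_read_b64,
// so anything wider is handed to SplitVectorLoad.
static unsigned maxUnsplitLoadElements(bool IsLocalAddressSpace) {
  return IsLocalAddressSpace ? 2u : 4u;
}
```

The same 4-element / 2-element split threshold reappears in the LowerSTORE hunk later in this patch.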
@@ -2905,8 +3690,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } - case AMDGPUAS::LOCAL_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { if (NumElements > 2) return SplitVectorLoad(Op, DAG); @@ -2916,9 +3700,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // If properly aligned, if we split we might be able to use ds_read_b64. return SplitVectorLoad(Op, DAG); } - default: - return SDValue(); - } + return SDValue(); } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -2956,11 +3738,15 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + const SDNodeFlags Flags = Op->getFlags(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); + + if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) + return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - VT == MVT::f16) { + if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. @@ -2989,15 +3775,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - const SDNodeFlags *Flags = Op->getFlags(); - - if (Unsafe || Flags->hasAllowReciprocal()) { + if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } return SDValue(); @@ -3287,18 +4069,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUASI.FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = VT.getVectorNumElements(); - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::FLAT_ADDRESS: + if (AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); return SDValue(); - case AMDGPUAS::PRIVATE_ADDRESS: { + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { case 4: return scalarizeVectorStore(Store, DAG); @@ -3313,8 +4094,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } - case AMDGPUAS::LOCAL_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { if (NumElements > 2) return SplitVectorStore(Op, DAG); @@ -3323,8 +4103,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // If properly aligned, if we split we might be able to use ds_write_b64. 
return SplitVectorStore(Op, DAG); - } - default: + } else { llvm_unreachable("unhandled address space"); } } @@ -3355,7 +4134,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co unsigned AS = AtomicNode->getAddressSpace(); // No custom lowering required for local address space - if (!isFlatGlobalAddrSpace(AS)) + if (!isFlatGlobalAddrSpace(AS, AMDGPUASI)) return Op; // Non-local address space requires custom lowering for atomic compare @@ -3412,12 +4191,12 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, const SISubtarget &STI) { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { + auto AMDGPUASI = STI.getAMDGPUAS(); + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { // MUBUF instructions a 12-bit offset in bytes. return isUInt<12>(OffsetSize); } - case AMDGPUAS::CONSTANT_ADDRESS: { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) @@ -3425,16 +4204,13 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { + if (AS == AMDGPUASI.LOCAL_ADDRESS || + AS == AMDGPUASI.REGION_ADDRESS) { // The single offset versions have a 16-bit offset in bytes. return isUInt<16>(OffsetSize); } - case AMDGPUAS::PRIVATE_ADDRESS: // Indirect register addressing does not use any offsets. - default: - return 0; - } + return false; } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) @@ -3492,7 +4268,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, // TODO: We could also do this for multiplies. unsigned AS = N->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); if (NewPtr) { SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); @@ -3538,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cmdmask_b32 to be deserialized. 
+static bool isBoolSGPR(SDValue V) { + if (V.getValueType() != MVT::i1) + return false; + switch (V.getOpcode()) { + default: break; + case ISD::SETCC: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case AMDGPUISD::FP_CLASS: + return true; + } + return false; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -3549,12 +4342,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } @@ -3598,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } + if (VT == MVT::i32 && + (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { + // and x, (sext cc from i1) => select cc, x, 0 + if (RHS.getOpcode() != ISD::SIGN_EXTEND) + std::swap(LHS, RHS); + if (isBoolSGPR(RHS.getOperand(0))) + return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), + LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); + } + return SDValue(); } @@ -3692,6 +4523,88 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return SDValue(); } +// Instructions that will be lowered with a final instruction that zeros the +// high result bits. +// XXX - probably only need to list legal operations. +static bool fp16SrcZerosHighBits(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FMA: + case ISD::FMAD: + case ISD::FCANONICALIZE: + case ISD::FP_ROUND: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. 
+ case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::LDEXP: + return true; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. + return false; + } +} + +SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!Subtarget->has16BitInsts() || + DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + SDValue Src = N->getOperand(0); + if (Src.getValueType() != MVT::i16) + return SDValue(); + + // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src + // FIXME: It is not universally true that the high bits are zeroed on gfx9. + if (Src.getOpcode() == ISD::BITCAST) { + SDValue BCSrc = Src.getOperand(0); + if (BCSrc.getValueType() == MVT::f16 && + fp16SrcZerosHighBits(BCSrc.getOpcode())) + return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc); + } + + return SDValue(); +} + SDValue SITargetLowering::performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -3709,27 +4622,123 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { + // If source is a result of another standard FP operation it is already in + // canonical form. + + switch (Op.getOpcode()) { + default: + break; + + // These will flush denorms if required. + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FSQRT: + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FMA: + case ISD::FMAD: + + case ISD::FCANONICALIZE: + return true; + + case ISD::FP_ROUND: + return Op.getValueType().getScalarType() != MVT::f16 || + ST->hasFP16Denormals(); + + case ISD::FP_EXTEND: + return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || + ST->hasFP16Denormals(); + + case ISD::FP16_TO_FP: + case ISD::FP_TO_FP16: + return ST->hasFP16Denormals(); + + // It can/will be lowered or combined as a bit operation. + // Need to check their input recursively to handle. + case ISD::FNEG: + case ISD::FABS: + return (MaxDepth > 0) && + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); + + case ISD::FSIN: + case ISD::FCOS: + case ISD::FSINCOS: + return Op.getValueType().getScalarType() != MVT::f16; + + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. + // For such targets need to check their input recursively. 
+ case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + + return (MaxDepth > 0) && + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); + + case ISD::ConstantFP: { + auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); + return !F.isDenormal() && !(F.isNaN() && F.isSignaling()); + } + } + return false; +} + // Constant fold canonicalize. SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); - if (!CFP) + SelectionDAG &DAG = DCI.DAG; + ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0)); + + if (!CFP) { + SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType().getScalarType(); + auto ST = getSubtarget(); + + if (((VT == MVT::f32 && ST->hasFP32Denormals()) || + (VT == MVT::f64 && ST->hasFP64Denormals()) || + (VT == MVT::f16 && ST->hasFP16Denormals())) && + DAG.isKnownNeverNaN(N0)) + return N0; + + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + + if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && + isCanonicalized(DAG, N0, ST)) + return N0; + return SDValue(); + } - SelectionDAG &DAG = DCI.DAG; const APFloat &C = CFP->getValueAPF(); // Flush denormals to 0 if not enabled. if (C.isDenormal()) { EVT VT = N->getValueType(0); - if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + EVT SVT = VT.getScalarType(); + if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); - if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); - if (VT == MVT::f16 && !Subtarget->hasFP16Denormals()) + if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); } @@ -3749,7 +4758,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); } - return SDValue(CFP, 0); + return N->getOperand(0); } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { @@ -3771,8 +4780,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) { +SDValue SITargetLowering::performIntMed3ImmCombine( + SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const { ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); if (!K1) return SDValue(); @@ -3790,34 +4800,28 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); + unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; + if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { + return DAG.getNode(Med3Opc, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + } + // If there isn't a 16-bit med3 operation, convert to 32-bit. MVT NVT = MVT::i32; unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Tmp1, Tmp2, Tmp3; - Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - - if (VT == MVT::i16) { - Tmp1 = DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, - Tmp1, Tmp2, Tmp3); + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); - } else - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); -} - -static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { - if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) - return true; - - return DAG.isKnownNeverNaN(Op); + SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); } -static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1) { +SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Op0, + SDValue Op1) const { ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); if (!K1) return SDValue(); @@ -3831,6 +4835,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + // TODO: Check IEEE bit enabled? + EVT VT = K0->getValueType(0); + if (Subtarget->enableDX10Clamp()) { + // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the + // hardware fmed3 behavior converting to a min. + // FIXME: Should this be allowing -0.0? + if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + } + + // med3 for f16 is only available on gfx9+. + if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16())) + return SDValue(); + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then // give the other result, which is different from med3 with a NaN input. @@ -3846,6 +4864,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -3853,7 +4872,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. 
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && + VT != MVT::f64 && + ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { @@ -3895,7 +4917,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && - N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + (VT == MVT::f32 || VT == MVT::f64 || + (VT == MVT::f16 && Subtarget->has16BitInsts())) && + Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; } @@ -3903,6 +4927,87 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return SDValue(); } +static bool isClampZeroToOne(SDValue A, SDValue B) { + if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { + if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { + // FIXME: Should this be allowing -0.0? + return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || + (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); + } + } + + return false; +} + +// FIXME: Should only worry about snans for version with chain. +SDValue SITargetLowering::performFMed3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and + // NaNs. With a NaN input, the order of the operands may change the result. + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + + if (isClampZeroToOne(Src0, Src1)) { + // const_a, const_b, x -> clamp is safe in all cases including signaling + // nans. + // FIXME: Should this be allowing -0.0? + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + } + + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother + // handling no dx10-clamp? + if (Subtarget->enableDX10Clamp()) { + // If NaNs is clamped to 0, we are free to reorder the inputs. 
+ + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) + std::swap(Src1, Src2); + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isClampZeroToOne(Src1, Src2)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); + } + + return SDValue(); +} + +SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + if (Src0.isUndef() && Src1.isUndef()) + return DCI.DAG.getUNDEF(N->getValueType(0)); + return SDValue(); +} + +SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + + SelectionDAG &DAG= DCI.DAG; + if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + } + + return SDValue(); +} + + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -3915,10 +5020,9 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || - (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() && - cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) && + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + (N0->getFlags().hasUnsafeAlgebra() && + N1->getFlags().hasUnsafeAlgebra())) && isFMAFasterThanFMulAndFAdd(VT)) { return ISD::FMA; } @@ -3926,6 +5030,102 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // add x, zext (setcc) => addcarry x, 0, setcc + // add x, sext (setcc) => subcarry x, 0, setcc + unsigned Opc = LHS.getOpcode(); + if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || + Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) + std::swap(RHS, LHS); + + Opc = RHS.getOpcode(); + switch (Opc) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: { + auto Cond = RHS.getOperand(0); + if (!isBoolSGPR(Cond)) + break; + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); + SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY; + return DAG.getNode(Opc, SL, VTList, Args); + } + case ISD::ADDCARRY: { + // add x, (addcarry y, 0, cc) => addcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) break; + SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); + } + } + return SDValue(); +} + +SDValue SITargetLowering::performSubCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + unsigned Opc = LHS.getOpcode(); + if (Opc != ISD::SUBCARRY) + std::swap(RHS, LHS); + + if (LHS.getOpcode() == ISD::SUBCARRY) { + // sub (subcarry x, 0, cc), y => subcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; + return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); + } + return SDValue(); +} + +SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + auto C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + + // addcarry (add x, y), 0, cc => addcarry x, y, cc + // subcarry (sub x, y), 0, cc => subcarry x, y, cc + unsigned LHSOpc = LHS.getOpcode(); + unsigned Opc = N->getOpcode(); + if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || + (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { + SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); + } + return SDValue(); +} + SDValue SITargetLowering::performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -3933,7 +5133,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - assert(!VT.isVector()); SDLoc SL(N); SDValue LHS = N->getOperand(0); @@ -4024,6 +5223,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + + auto CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) { + CRHS = dyn_cast<ConstantSDNode>(LHS); + if (CRHS) { + std::swap(LHS, RHS); + CC = getSetCCSwappedOperands(CC); + } + } + + if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + 
return LHS.getOperand(0); + } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && VT != MVT::f16)) @@ -4031,7 +5259,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, // Match isinf pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); if (!CRHS) @@ -4080,12 +5307,12 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - APInt KnownZero, KnownOne; + KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || + TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } @@ -4097,6 +5324,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::ADD: + return performAddCombine(N, DCI); + case ISD::SUB: + return performSubCombine(N, DCI); + case ISD::ADDCARRY: + case ISD::SUBCARRY: + return performAddCarrySubCarryCombine(N, DCI); case ISD::FADD: return performFAddCombine(N, DCI); case ISD::FSUB: @@ -4112,7 +5346,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) return performMinMaxCombine(N, DCI); break; @@ -4135,17 +5368,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); - } case ISD::AND: return performAndCombine(N, DCI); case ISD::OR: return performOrCombine(N, DCI); case ISD::XOR: return performXorCombine(N, DCI); + case ISD::ZERO_EXTEND: + return performZeroExtendCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); case ISD::FCANONICALIZE: @@ -4170,6 +5404,30 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: return performCvtF32UByteNCombine(N, DCI); + case AMDGPUISD::FMED3: + return performFMed3Combine(N, DCI); + case AMDGPUISD::CVT_PKRTZ_F16_F32: + return performCvtPkRTZCombine(N, DCI); + case ISD::SCALAR_TO_VECTOR: { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + SDLoc SL(N); + SDValue Src = N->getOperand(0); + EVT EltVT = Src.getValueType(); + if (EltVT == MVT::f16) + Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); + + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); + return DAG.getNode(ISD::BITCAST, SL, VT, Ext); + } + + break; + } + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -4198,6 +5456,10 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E; ++I) { + // Don't look at users of the chain. + if (I.getUse().getResNo() != 0) + continue; + // Abort if we can't understand the usage if (!I->isMachineOpcode() || I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) @@ -4250,7 +5512,6 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Update the users of the node with the new indices for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { - SDNode *User = Users[i]; if (!User) continue; @@ -4277,8 +5538,33 @@ static bool isFrameIndexOp(SDValue Op) { /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. -void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { +SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + if (Node->getOpcode() == ISD::CopyToReg) { + RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1)); + SDValue SrcVal = Node->getOperand(2); + + // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have + // to try understanding copies to physical registers. + if (SrcVal.getValueType() == MVT::i1 && + TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) { + SDLoc SL(Node); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + SDValue VReg = DAG.getRegister( + MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); + + SDNode *Glued = Node->getGluedNode(); + SDValue ToVReg + = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, + SDValue(Glued, Glued ? 
Glued->getNumValues() - 1 : 0)); + SDValue ToResultReg + = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), + VReg, ToVReg.getValue(1)); + DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode()); + DAG.RemoveDeadNode(Node); + return ToResultReg.getNode(); + } + } SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < Node->getNumOperands(); ++i) { @@ -4294,6 +5580,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, } DAG.UpdateNodeOperands(Node, Ops); + return Node; } /// \brief Fold the instructions after selecting them. @@ -4460,15 +5747,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast<RegisterSDNode>(VReg)->getReg(), VT); -} - //===----------------------------------------------------------------------===// // SI Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4496,6 +5774,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &AMDGPU::SReg_128RegClass); case 256: return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::SReg_512RegClass); } case 'v': @@ -4549,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { } return TargetLowering::getConstraintType(Constraint); } + +// Figure out which registers should be reserved for stack access. Only after +// the function is legalized do we know all of the non-spill stack objects or if +// calls are present. +void SITargetLowering::finalizeLowering(MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + if (Info->isEntryFunction()) { + // Callable functions have fixed registers used for stack access. + reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + } + + // We have to assume the SP is needed in case there are calls in the function + // during lowering. Calls are only detected after the function is + // lowered. We're about to reserve registers, so don't bother using it if we + // aren't really going to use it. 
+ bool NeedSP = !Info->isEntryFunction() || + MFI.hasVarSizedObjects() || + MFI.hasCalls(); + + if (NeedSP) { + unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); + Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); + } + + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + + TargetLoweringBase::finalizeLowering(MF); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index 6c04e4f..e6bb3d6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -21,11 +21,17 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { - SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - unsigned Offset) const; - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, - SDValue Chain, unsigned Offset, bool Signed, - const ISD::InputArg *Arg = nullptr) const; + SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, + SDValue Chain, uint64_t Offset) const; + SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + uint64_t Offset, bool Signed, + const ISD::InputArg *Arg = nullptr) const; + + SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -55,11 +61,19 @@ class SITargetLowering final : public AMDGPUTargetLowering { const SDLoc &DL, EVT VT) const; + SDValue convertArgType( + SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, + bool Signed, const ISD::InputArg *Arg = nullptr) const; + /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, + SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; @@ -79,13 +93,24 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) const; + SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -94,7 +119,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; - bool isCFIntrinsic(const SDNode *Intr) const; + unsigned isCFIntrinsic(const SDNode *Intr) const; void createDebuggerPrologueStackObjects(MachineFunction &MF) const; @@ -115,15 +140,22 @@ public: const SISubtarget *getSubtarget() const; + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, unsigned IntrinsicID) const override; - bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, - EVT /*VT*/) const override; + bool getAddrModeArguments(IntrinsicInst * /*I*/, + SmallVectorImpl<Value*> &/*Ops*/, + Type *&/*AccessTy*/) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; @@ -155,7 +187,12 @@ public: const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + bool CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + 
LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; @@ -175,14 +212,15 @@ public: MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; - void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const; @@ -194,6 +232,8 @@ public: ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; + + void finalizeLowering(MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 91e4bf7..ba346d2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -1,4 +1,4 @@ -//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===// +//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// // // The LLVM Compiler Infrastructure // @@ -12,33 +12,46 @@ /// branches when it's expected that jumping over the untaken control flow will /// be cheaper than having every workitem no-op through it. 
// +//===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; #define DEBUG_TYPE "si-insert-skips" -namespace { - static cl::opt<unsigned> SkipThresholdFlag( "amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden); +namespace { + class SIInsertSkips : public MachineFunctionPass { private: - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - unsigned SkipThreshold; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + unsigned SkipThreshold = 0; bool shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const; @@ -55,8 +68,7 @@ private: public: static char ID; - SIInsertSkips() : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { } + SIInsertSkips() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -69,7 +81,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char SIInsertSkips::ID = 0; @@ -195,8 +207,8 @@ void SIInsertSkips::kill(MachineInstr &MI) { } } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) - .addImm(0) - .addOperand(Op); + .addImm(0) + .add(Op); } } @@ -251,6 +263,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; + bool HaveSkipBlock = false; if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { // Reached convergence point for last divergent branch. @@ -270,27 +283,33 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; switch (MI.getOpcode()) { - case AMDGPU::SI_MASK_BRANCH: { + case AMDGPU::SI_MASK_BRANCH: ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); break; - } - case AMDGPU::S_BRANCH: { + + case AMDGPU::S_BRANCH: // Optimize out branches to the next block. // FIXME: Shouldn't this be handled by BranchFolding? - if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) + if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { + MI.eraseFromParent(); + } else if (HaveSkipBlock) { + // Remove the given unconditional branch when a skip block has been + // inserted after the current one and let skip the two instructions + // performing the kill if the exec mask is non-zero. 
MI.eraseFromParent(); + } break; - } - case AMDGPU::SI_KILL_TERMINATOR: { + + case AMDGPU::SI_KILL_TERMINATOR: MadeChange = true; kill(MI); if (ExecBranchStack.empty()) { if (skipIfDead(MI, *NextBB)) { + HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); - Next = MBB.end(); } } else { HaveKill = true; @@ -298,15 +317,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); break; - } - case AMDGPU::SI_RETURN: { + + case AMDGPU::SI_RETURN_TO_EPILOG: // FIXME: Should move somewhere else assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); // Graphics shaders returning non-void shouldn't contain S_ENDPGM, // because external bytecode will be appended at the end. if (BI != --MF.end() || I != MBB.getFirstTerminator()) { - // SI_RETURN is not the last instruction. Add an empty block at + // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at // the end and jump there. if (!EmptyMBBAtEnd) { EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); @@ -318,7 +337,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { .addMBB(EmptyMBBAtEnd); I->eraseFromParent(); } - } + break; + default: break; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp new file mode 100644 index 0000000..0f009a4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -0,0 +1,1882 @@ +//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define DEBUG_TYPE "si-insert-waitcnts" + +using namespace llvm; + +namespace { + +// Class of object that encapsulates latest instruction counter score +// associated with the operand. Used for determining whether +// s_waitcnt instruction needs to be emited. 
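For orientation, the short sketch below is an editor's illustration (invented, simplified names; not part of this patch) of the score-bracket idea described above: each asynchronous access receives a monotonically increasing score, every counter keeps a bracket [LB, UB] of still-outstanding scores, and a register needs an s_waitcnt only while its recorded score still lies inside that bracket.

// Illustrative sketch only: hypothetical types, one counter, a toy register file.
struct ScoreBracketSketch {
  int LB = 0, UB = 0;         // bounds of still-outstanding scores
  int RegScore[8] = {0};      // last score assigned to each toy register

  void issueAccess(int Reg) { // an asynchronous operation writing Reg
    RegScore[Reg] = ++UB;
  }
  bool needsWait(int Reg) const {  // about to read or overwrite Reg?
    return RegScore[Reg] > LB && RegScore[Reg] <= UB;
  }
  void applyWaitcntZero() {   // after "s_waitcnt 0" everything has completed
    LB = UB;
  }
};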
+ +#define CNT_MASK(t) (1u << (t)) + +enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; + +typedef std::pair<signed, signed> RegInterval; + +struct { + int32_t VmcntMax; + int32_t ExpcntMax; + int32_t LgkmcntMax; + int32_t NumVGPRsMax; + int32_t NumSGPRsMax; +} HardwareLimits; + +struct { + unsigned VGPR0; + unsigned VGPRL; + unsigned SGPR0; + unsigned SGPRL; +} RegisterEncoding; + +enum WaitEventType { + VMEM_ACCESS, // vector-memory read & write + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + NUM_WAIT_EVENTS, +}; + +// The mapping is: +// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs +// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots +// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs +// We reserve a fixed number of VGPR slots in the scoring tables for +// special tokens like SCMEM_LDS (needed for buffer load to LDS). +enum RegisterMapping { + SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets. + SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. + NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. + EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. + NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. +}; + +#define ForAllWaitEventType(w) \ + for (enum WaitEventType w = (enum WaitEventType)0; \ + (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \ + (w) = (enum WaitEventType)((w) + 1)) + +// This is a per-basic-block object that maintains current score brackets +// of each wait-counter, and a per-register scoreboard for each wait-couner. +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait-count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class BlockWaitcntBrackets { +public: + static int32_t getWaitCountMax(InstCounterType T) { + switch (T) { + case VM_CNT: + return HardwareLimits.VmcntMax; + case LGKM_CNT: + return HardwareLimits.LgkmcntMax; + case EXP_CNT: + return HardwareLimits.ExpcntMax; + default: + break; + } + return 0; + }; + + void setScoreLB(InstCounterType T, int32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreLBs[T] = Val; + }; + + void setScoreUB(InstCounterType T, int32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreUBs[T] = Val; + if (T == EXP_CNT) { + int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT)); + if (ScoreLBs[T] < UB) + ScoreLBs[T] = UB; + } + }; + + int32_t getScoreLB(InstCounterType T) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return 0; + return ScoreLBs[T]; + }; + + int32_t getScoreUB(InstCounterType T) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return 0; + return ScoreUBs[T]; + }; + + // Mapping from event to counter. 
+ InstCounterType eventCounter(WaitEventType E) { + switch (E) { + case VMEM_ACCESS: + return VM_CNT; + case LDS_ACCESS: + case GDS_ACCESS: + case SQ_MESSAGE: + case SMEM_ACCESS: + return LGKM_CNT; + case EXP_GPR_LOCK: + case GDS_GPR_LOCK: + case VMW_GPR_LOCK: + case EXP_POS_ACCESS: + case EXP_PARAM_ACCESS: + return EXP_CNT; + default: + llvm_unreachable("unhandled event type"); + } + return NUM_INST_CNTS; + } + + void setRegScore(int GprNo, InstCounterType T, int32_t Val) { + if (GprNo < NUM_ALL_VGPRS) { + if (GprNo > VgprUB) { + VgprUB = GprNo; + } + VgprScores[T][GprNo] = Val; + } else { + assert(T == LGKM_CNT); + if (GprNo - NUM_ALL_VGPRS > SgprUB) { + SgprUB = GprNo - NUM_ALL_VGPRS; + } + SgprScores[GprNo - NUM_ALL_VGPRS] = Val; + } + } + + int32_t getRegScore(int GprNo, InstCounterType T) { + if (GprNo < NUM_ALL_VGPRS) { + return VgprScores[T][GprNo]; + } + return SgprScores[GprNo - NUM_ALL_VGPRS]; + } + + void clear() { + memset(ScoreLBs, 0, sizeof(ScoreLBs)); + memset(ScoreUBs, 0, sizeof(ScoreUBs)); + memset(EventUBs, 0, sizeof(EventUBs)); + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + memset(VgprScores[T], 0, sizeof(VgprScores[T])); + } + memset(SgprScores, 0, sizeof(SgprScores)); + } + + RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, unsigned OpNo, + bool Def) const; + + void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, + unsigned OpNo, int32_t Val); + + void setWaitAtBeginning() { WaitAtBeginning = true; } + void clearWaitAtBeginning() { WaitAtBeginning = false; } + bool getWaitAtBeginning() const { return WaitAtBeginning; } + void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } + int32_t getMaxVGPR() const { return VgprUB; } + int32_t getMaxSGPR() const { return SgprUB; } + int32_t getEventUB(enum WaitEventType W) const { + assert(W < NUM_WAIT_EVENTS); + return EventUBs[W]; + } + bool counterOutOfOrder(InstCounterType T); + unsigned int updateByWait(InstCounterType T, int ScoreToWait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + BlockWaitcntBrackets() + : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false), MixedExpTypes(false), + LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + memset(VgprScores[T], 0, sizeof(VgprScores[T])); + } + } + ~BlockWaitcntBrackets(){}; + + bool hasPendingSMEM() const { + return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); + } + + bool hasPendingFlat() const { + return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && + LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || + (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && + LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); + } + + void setPendingFlat() { + LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; + LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; + } + + int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } + + void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } + + bool getRevisitLoop() const { return RevisitLoop; } + void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } + + void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } + int32_t getPostOrder() const { 
return PostOrder; } + + void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } + void clearWaitcnt() { Waitcnt = NULL; } + MachineInstr *getWaitcnt() const { return Waitcnt; } + + bool mixedExpTypes() const { return MixedExpTypes; } + void setMixedExpTypes(bool MixedExpTypesIn) { + MixedExpTypes = MixedExpTypesIn; + } + + void print(raw_ostream &); + void dump() { print(dbgs()); } + +private: + bool WaitAtBeginning; + bool RevisitLoop; + bool ValidLoop; + bool MixedExpTypes; + MachineLoop *LoopRegion; + int32_t PostOrder; + MachineInstr *Waitcnt; + int32_t ScoreLBs[NUM_INST_CNTS] = {0}; + int32_t ScoreUBs[NUM_INST_CNTS] = {0}; + int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; + // Remember the last flat memory operation. + int32_t LastFlat[NUM_INST_CNTS] = {0}; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int32_t VgprUB; + int32_t SgprUB; + int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + // Wait cnt scores for every sgpr, only lgkmcnt is relevant. + int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; +}; + +// This is a per-loop-region object that records waitcnt status at the end of +// loop footer from the previous iteration. We also maintain an iteration +// count to track the number of times the loop has been visited. When it +// doesn't converge naturally, we force convergence by inserting s_waitcnt 0 +// at the end of the loop footer. +class LoopWaitcntData { +public: + void incIterCnt() { IterCnt++; } + void resetIterCnt() { IterCnt = 0; } + int32_t getIterCnt() { return IterCnt; } + + LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {} + ~LoopWaitcntData(){}; + + void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } + MachineInstr *getWaitcnt() const { return LfWaitcnt; } + + void print() { + DEBUG(dbgs() << " iteration " << IterCnt << '\n';); + return; + } + +private: + // s_waitcnt added at the end of loop footer to stablize wait scores + // at the end of the loop footer. + MachineInstr *LfWaitcnt; + // Number of iterations the loop has been visited, not including the initial + // walk over. + int32_t IterCnt; +}; + +class SIInsertWaitcnts : public MachineFunctionPass { + +private: + const SISubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + const MachineLoopInfo *MLI; + AMDGPU::IsaInfo::IsaVersion IV; + AMDGPUAS AMDGPUASI; + + DenseSet<MachineBasicBlock *> BlockVisitedSet; + DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet; + DenseSet<MachineInstr *> VCCZBugHandledSet; + + DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>> + BlockWaitcntBracketsMap; + + DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet; + + DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap; + + std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets; + +public: + static char ID; + + SIInsertWaitcnts() + : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr), + MRI(nullptr), MLI(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + void addKillWaitBracket(BlockWaitcntBrackets *Bracket) { + // The waitcnt information is copied because it changes as the block is + // traversed. 
+ KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket)); + } + + MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI, + BlockWaitcntBrackets *ScoreBrackets); + void updateEventWaitCntAfter(MachineInstr &Inst, + BlockWaitcntBrackets *ScoreBrackets); + void mergeInputScoreBrackets(MachineBasicBlock &Block); + MachineBasicBlock *loopBottom(const MachineLoop *Loop); + void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); + void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); +}; + +} // End anonymous namespace. + +RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, + const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + unsigned OpNo, + bool Def) const { + const MachineOperand &Op = MI->getOperand(OpNo); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || + (Def && !Op.isDef())) + return {-1, -1}; + + // A use via a PW operand does not need a waitcnt. + // A partial write is not a WAW. + assert(!Op.getSubReg() || !Op.isUndef()); + + RegInterval Result; + const MachineRegisterInfo &MRIA = *MRI; + + unsigned Reg = TRI->getEncodingValue(Op.getReg()); + + if (TRI->isVGPR(MRIA, Op.getReg())) { + assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); + Result.first = Reg - RegisterEncoding.VGPR0; + assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); + } else if (TRI->isSGPRReg(MRIA, Op.getReg())) { + assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Result.first >= NUM_ALL_VGPRS && + Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); + } + // TODO: Handle TTMP + // else if (TRI->isTTMP(MRIA, Reg.getReg())) ... + else + return {-1, -1}; + + const MachineInstr &MIA = *MI; + const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.second = Result.first + (Size / 32); + + return Result; +} + +void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + unsigned OpNo, int32_t Val) { + RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); + DEBUG({ + const MachineOperand &Opnd = MI->getOperand(OpNo); + assert(TRI->isVGPR(*MRI, Opnd.getReg())); + }); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + setRegScore(RegNo, EXP_CNT, Val); + } +} + +void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + WaitEventType E, MachineInstr &Inst) { + const MachineRegisterInfo &MRIA = *MRI; + InstCounterType T = eventCounter(E); + int32_t CurrScore = getScoreUB(T) + 1; + // EventUB and ScoreUB need to be update regardless if this event changes + // the score of a register or not. + // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. + EventUBs[E] = CurrScore; + setScoreUB(T, CurrScore); + + if (T == EXP_CNT) { + // Check for mixed export types. If they are mixed, then a waitcnt exp(0) + // is required. + if (!MixedExpTypes) { + MixedExpTypes = counterOutOfOrder(EXP_CNT); + } + + // Put score on the source vgprs. If this is a store, just use those + // specific register(s). + if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { + // All GDS operations must protect their address register (same as + // export.) 
+ if (Inst.getOpcode() != AMDGPU::DS_APPEND && + Inst.getOpcode() != AMDGPU::DS_CONSUME) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr), + CurrScore); + } + if (Inst.mayStore()) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), + CurrScore); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data1) != -1) { + setExpScore(&Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data1), + CurrScore); + } + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 && + Inst.getOpcode() != AMDGPU::DS_GWS_INIT && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P && + Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER && + Inst.getOpcode() != AMDGPU::DS_APPEND && + Inst.getOpcode() != AMDGPU::DS_CONSUME && + Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = Inst.getOperand(I); + if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) { + setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); + } + } + } + } else if (TII->isFLAT(Inst)) { + if (Inst.mayStore()) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else if (TII->isMIMG(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else if (TII->isMTBUF(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } + } else if (TII->isMUBUF(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else { + if (TII->isEXP(Inst)) { + // For export the destination registers are really temps that + // can be used as the actual source after export patching, so + // we need to treat them like sources and set the EXP_CNT + // score. + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + MachineOperand &DefMO = Inst.getOperand(I); + if (DefMO.isReg() && DefMO.isDef() && + TRI->isVGPR(MRIA, DefMO.getReg())) { + setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT, + CurrScore); + } + } + } + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + MachineOperand &MO = Inst.getOperand(I); + if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) { + setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); + } + } + } +#if 0 // TODO: check if this is handled by MUBUF code above. 
+ } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { + MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); + unsigned OpNo;//TODO: find the OpNo for this operand; + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; + ++RegNo) { + setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); + } +#endif + } else { + // Match the score to the destination registers. + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true); + if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS) + continue; + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + setRegScore(RegNo, T, CurrScore); + } + } + if (TII->isDS(Inst) && Inst.mayStore()) { + setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); + } + } +} + +void BlockWaitcntBrackets::print(raw_ostream &OS) { + OS << '\n'; + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int LB = getScoreLB(T); + int UB = getScoreUB(T); + + switch (T) { + case VM_CNT: + OS << " VM_CNT(" << UB - LB << "): "; + break; + case LGKM_CNT: + OS << " LGKM_CNT(" << UB - LB << "): "; + break; + case EXP_CNT: + OS << " EXP_CNT(" << UB - LB << "): "; + break; + default: + OS << " UNKNOWN(" << UB - LB << "): "; + break; + } + + if (LB < UB) { + // Print vgpr scores. + for (int J = 0; J <= getMaxVGPR(); J++) { + int RegScore = getRegScore(J, T); + if (RegScore <= LB) + continue; + int RelScore = RegScore - LB - 1; + if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { + OS << RelScore << ":v" << J << " "; + } else { + OS << RelScore << ":ds "; + } + } + // Also need to print sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= getMaxSGPR(); J++) { + int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (RegScore <= LB) + continue; + int RelScore = RegScore - LB - 1; + OS << RelScore << ":s" << J << " "; + } + } + } + OS << '\n'; + } + OS << '\n'; + return; +} + +unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, + int ScoreToWait) { + unsigned int NeedWait = 0; + if (ScoreToWait == -1) { + // The score to wait is unknown. This implies that it was not encountered + // during the path of the CFG walk done during the current traversal but + // may be seen on a different path. Emit an s_wait counter with a + // conservative value of 0 for the counter. + NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + return NeedWait; + } + + // If the score of src_operand falls within the bracket, we need an + // s_waitcnt instruction. + const int32_t LB = getScoreLB(T); + const int32_t UB = getScoreUB(T); + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if (T == VM_CNT && hasPendingFlat()) { + // If there is a pending FLAT operation, and this is a VM waitcnt, + // then we need to force a waitcnt 0 for VM. + NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the brack. Also emit an s_wait counter + // with a conservative value of 0 for the counter. 
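// Editor's note (illustration, not part of the patch): LGKM_CNT covers LDS,
// GDS, scalar-memory and message operations, and scalar-memory results may
// return out of order relative to the others. A partial wait such as
// lgkmcnt(2) therefore cannot say *which* two operations remain outstanding,
// so only the conservative wait on 0 chosen here is guaranteed to cover the
// operand being waited for.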
+ NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + } else { + NeedWait = CNT_MASK(T); + setScoreLB(T, ScoreToWait); + } + } + + return NeedWait; +} + +// Where there are multiple types of event in the bracket of a counter, +// the decrement may go out of order. +bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) { + switch (T) { + case VM_CNT: + return false; + case LGKM_CNT: { + if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { + // Scalar memory read always can go out of order. + return true; + } + int NumEventTypes = 0; + if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && + EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (NumEventTypes <= 1) { + return false; + } + break; + } + case EXP_CNT: { + // If there has been a mixture of export types, then a waitcnt exp(0) is + // required. + if (MixedExpTypes) + return true; + int NumEventTypes = 0; + if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + + if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + + if (NumEventTypes <= 1) { + return false; + } + break; + } + default: + break; + } + return true; +} + +INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, + false) +INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, + false) + +char SIInsertWaitcnts::ID = 0; + +char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; + +FunctionPass *llvm::createSIInsertWaitcntsPass() { + return new SIInsertWaitcnts(); +} + +static bool readsVCCZ(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); +} + +/// \brief Generate s_waitcnt instruction to be placed before cur_Inst. +/// Instructions of a given type are returned in order, +/// but instructions of different types can complete out of order. +/// We rely on this in-order completion +/// and simply assign a score to the memory access instructions. +/// We keep track of the active "score bracket" to determine +/// if an access of a memory read requires an s_waitcnt +/// and if so what the value of each counter is. +/// The "score bracket" is bound by the lower bound and upper bound +/// scores (*_score_LB and *_score_ub respectively). +MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( + MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { + // To emit, or not to emit - that's the question! + // Start with an assumption that there is no need to emit. + unsigned int EmitSwaitcnt = 0; + // s_waitcnt instruction to return; default is NULL. + MachineInstr *SWaitInst = nullptr; + // No need to wait before phi. 
If a phi-move exists, then the wait should + // has been inserted before the move. If a phi-move does not exist, then + // wait should be inserted before the real use. The same is true for + // sc-merge. It is not a coincident that all these cases correspond to the + // instructions that are skipped in the assembling loop. + bool NeedLineMapping = false; // TODO: Check on this. + if (MI.isDebugValue() && + // TODO: any other opcode? + !NeedLineMapping) { + return SWaitInst; + } + + // See if an s_waitcnt is forced at block entry, or is needed at + // program end. + if (ScoreBrackets->getWaitAtBeginning()) { + // Note that we have already cleared the state, so we don't need to update + // it. + ScoreBrackets->clearWaitAtBeginning(); + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + EmitSwaitcnt |= CNT_MASK(T); + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + } + } + + // See if this instruction has a forced S_WAITCNT VM. + // TODO: Handle other cases of NeedsWaitcntVmBefore() + else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + EmitSwaitcnt |= + ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + } + + // All waits must be resolved at call return. + // NOTE: this could be improved with knowledge of all call sites or + // with knowledge of the called routines. + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + EmitSwaitcnt |= CNT_MASK(T); + } + } + } + // Resolve vm waits before gs-done. + else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || + MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == + AMDGPU::SendMsg::ID_GS_DONE)) { + if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { + ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + EmitSwaitcnt |= CNT_MASK(VM_CNT); + } + } +#if 0 // TODO: the following blocks of logic when we have fence. 
+ else if (MI.getOpcode() == SC_FENCE) { + const unsigned int group_size = + context->shader_info->GetMaxThreadGroupSize(); + // group_size == 0 means thread group size is unknown at compile time + const bool group_is_multi_wave = + (group_size == 0 || group_size > target_info->GetWaveFrontSize()); + const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); + + for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { + SCRegType src_type = Inst->GetSrcType(i); + switch (src_type) { + case SCMEM_LDS: + if (group_is_multi_wave || + context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + // LDS may have to wait for VM_CNT after buffer load to LDS + if (target_info->HasBufferLoadToLDS()) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } + } + break; + + case SCMEM_GDS: + if (group_is_multi_wave || fence_is_global) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + } + break; + + case SCMEM_UAV: + case SCMEM_TFBUF: + case SCMEM_RING: + case SCMEM_SCATTER: + if (group_is_multi_wave || fence_is_global) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } + break; + + case SCMEM_SCRATCH: + default: + break; + } + } + } +#endif + + // Export & GDS instructions do not read the EXEC mask until after the export + // is granted (which can occur well after the instruction is issued). + // The shader program must flush all EXP operations on the export-count + // before overwriting the EXEC mask. + else { + if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { + // Export and GDS are tracked individually, either may trigger a waitcnt + // for EXEC. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); + } + +#if 0 // TODO: the following code to handle CALL. + // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT. + // However, there is a problem with EXP_CNT, because the call cannot + // easily tell if a register is used in the function, and if it did, then + // the referring instruction would have to have an S_WAITCNT, which is + // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs + // before the call. + if (MI.getOpcode() == SC_CALL) { + if (ScoreBrackets->getScoreUB(EXP_CNT) > + ScoreBrackets->getScoreLB(EXP_CNT)) { + ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= CNT_MASK(EXP_CNT); + } + } +#endif + + // Look at the source operands of every instruction to see if + // any of them results from a previous memory operation that affects + // its current usage. If so, an s_waitcnt instruction needs to be + // emitted. + // If the source operand was defined by a load, add the s_waitcnt + // instruction. 
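// Editor's illustration (invented example, not from the patch): after
//   buffer_load_dword v0, ...   ; issues asynchronously, bumps v0's VM_CNT score
// a later
//   v_add_f32 v1, v0, v2        ; reads v0
// must be preceded by an s_waitcnt whose vmcnt covers v0's score, or it would
// consume a stale value. The loops below perform this check for LDS
// memoperands and for every register source operand of the instruction.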
+ for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUASI.LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + // VM_CNT is only relevant to vgpr or LDS. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + } + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Op.getReg())) { + // VM_CNT is only relevant to vgpr or LDS. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + } + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + } + } + // End of for loop that looks at all source operands to decide vm_wait_cnt + // and lgk_wait_cnt. + + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. + if (MI.mayStore()) { + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUASI.LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + } + } + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + MachineOperand &Def = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Def.getReg())) { + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + } + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + } + } // End of for loop that looks at all dest operands. + } + + // TODO: Tie force zero to a compiler triage option. + bool ForceZero = false; + + // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 + // occurs before the instruction. Doing it here prevents any additional + // S_WAITCNTs from being emitted if the instruction was marked as + // requiring a WAITCNT beforehand. + if (MI.getOpcode() == AMDGPU::S_BARRIER && + !ST->hasAutoWaitcntBeforeBarrier()) { + EmitSwaitcnt |= + ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); + } + + // TODO: Remove this work-around, enable the assert for Bug 457939 + // after fixing the scheduler. Also, the Shader Compiler code is + // independent of target. 
+ if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (ScoreBrackets->getScoreLB(LGKM_CNT) < + ScoreBrackets->getScoreUB(LGKM_CNT) && + ScoreBrackets->hasPendingSMEM()) { + // Wait on everything, not just LGKM. vccz reads usually come from + // terminators, and we always wait on everything at the end of the + // block, so if we only wait on LGKM here, we might end up with + // another s_waitcnt inserted right after this if there are non-LGKM + // instructions still outstanding. + ForceZero = true; + EmitSwaitcnt = true; + } + } + + // Does this operand processing indicate s_wait counter update? + if (EmitSwaitcnt) { + int CntVal[NUM_INST_CNTS]; + + bool UseDefaultWaitcntStrategy = true; + if (ForceZero) { + // Force all waitcnts to 0. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + } + CntVal[VM_CNT] = 0; + CntVal[EXP_CNT] = 0; + CntVal[LGKM_CNT] = 0; + UseDefaultWaitcntStrategy = false; + } + + if (UseDefaultWaitcntStrategy) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + if (EmitSwaitcnt & CNT_MASK(T)) { + int Delta = + ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); + int MaxDelta = ScoreBrackets->getWaitCountMax(T); + if (Delta >= MaxDelta) { + Delta = -1; + if (T != EXP_CNT) { + ScoreBrackets->setScoreLB( + T, ScoreBrackets->getScoreUB(T) - MaxDelta); + } + EmitSwaitcnt &= ~CNT_MASK(T); + } + CntVal[T] = Delta; + } else { + // If we are not waiting for a particular counter then encode + // it as -1 which means "don't care." + CntVal[T] = -1; + } + } + } + + // If we are not waiting on any counter we can skip the wait altogether. + if (EmitSwaitcnt != 0) { + MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); + int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm(); + if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) != + (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || + (AMDGPU::decodeExpcnt(IV, Imm) != + (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || + (AMDGPU::decodeLgkmcnt(IV, Imm) != + (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { + MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); + if (ContainingLoop) { + MachineBasicBlock *TBB = ContainingLoop->getHeader(); + BlockWaitcntBrackets *ScoreBracket = + BlockWaitcntBracketsMap[TBB].get(); + if (!ScoreBracket) { + assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); + BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>(); + ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); + } + ScoreBracket->setRevisitLoop(true); + DEBUG(dbgs() << "set-revisit: block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } + } + + // Update an existing waitcount, or make a new one. 
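// Editor's worked example (numbers invented): if ScoreUB(VM_CNT) is 7 and the
// operand being waited on carries score 5, updateByWait moved ScoreLB up to 5,
// so Delta = 7 - 5 = 2 and the immediate encodes vmcnt(2): stall until at most
// two VMEM operations are still outstanding, which guarantees that the
// operation scored 5 (and everything older) has completed. A CntVal of -1
// means "don't care" for that counter when the immediate is encoded.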
+ MachineFunction &MF = *MI.getParent()->getParent(); + if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) { + SWaitInst = OldWaitcnt; + } else { + SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), + MI.getDebugLoc()); + CompilerGeneratedWaitcntSet.insert(SWaitInst); + } + + const MachineOperand &Op = + MachineOperand::CreateImm(AMDGPU::encodeWaitcnt( + IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT])); + SWaitInst->addOperand(MF, Op); + + if (CntVal[EXP_CNT] == 0) { + ScoreBrackets->setMixedExpTypes(false); + } + } + } + + return SWaitInst; +} + +void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, + MachineInstr *Waitcnt) { + if (MBB.empty()) { + MBB.push_back(Waitcnt); + return; + } + + MachineBasicBlock::iterator It = MBB.end(); + MachineInstr *MI = &*(--It); + if (MI->isBranch()) { + MBB.insert(It, Waitcnt); + } else { + MBB.push_back(Waitcnt); + } + + return; +} + +void SIInsertWaitcnts::updateEventWaitCntAfter( + MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { + // Now look at the instruction opcode. If it is a memory access + // instruction, update the upper-bound of the appropriate counter's + // bracket and the destination operand scores. + // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. + uint64_t TSFlags = Inst.getDesc().TSFlags; + if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && + TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); + } else { + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + } + } else if (TII->isFLAT(Inst)) { + assert(Inst.mayLoad() || Inst.mayStore()); + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + + // This is a flat memory operation. Check to see if it has memory + // tokens for both LDS and Memory, and if so mark it as a flat. + bool FoundLDSMem = false; + for (const MachineMemOperand *Memop : Inst.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) + FoundLDSMem = true; + } + + // This is a flat memory operation, so note it - it will require + // that both the VM and LGKM be flushed to zero if it is pending when + // a VM or LGKM dependency occurs. + if (FoundLDSMem) { + ScoreBrackets->setPendingFlat(); + } + } else if (SIInstrInfo::isVMEM(Inst) && + // TODO: get a better carve out. 
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 && + Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && + Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); + } + } else if (TII->isSMRD(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + } else { + switch (Inst.getOpcode()) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); + break; + case AMDGPU::EXP: + case AMDGPU::EXP_DONE: { + int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); + if (Imm >= 32 && Imm <= 63) + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); + else if (Imm >= 12 && Imm <= 15) + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); + else + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); + break; + } + case AMDGPU::S_MEMTIME: + case AMDGPU::S_MEMREALTIME: + ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + break; + default: + break; + } + } +} + +void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); + int32_t MaxPending[NUM_INST_CNTS] = {0}; + int32_t MaxFlat[NUM_INST_CNTS] = {0}; + bool MixedExpTypes = false; + + // Clear the score bracket state. + ScoreBrackets->clear(); + + // Compute the number of pending elements on block entry. + + // IMPORTANT NOTE: If iterative handling of loops is added, the code will + // need to handle single BBs with backedges to themselves. This means that + // they will need to retain and not clear their initial state. + + // See if there are any uninitialized predecessors. If so, emit an + // s_waitcnt 0 at the beginning of the block. + for (MachineBasicBlock *pred : Block.predecessors()) { + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[pred].get(); + bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); + if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + break; + } + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int span = + PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); + MaxPending[T] = std::max(MaxPending[T], span); + span = + PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); + MaxFlat[T] = std::max(MaxFlat[T], span); + } + + MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + // Also handle kills for exit block. + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int Span = KillWaitBrackets[I]->getScoreUB(T) - + KillWaitBrackets[I]->getScoreLB(T); + MaxPending[T] = std::max(MaxPending[T], Span); + Span = KillWaitBrackets[I]->pendingFlat(T) - + KillWaitBrackets[I]->getScoreLB(T); + MaxFlat[T] = std::max(MaxFlat[T], Span); + } + + MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes(); + } + } + + // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. 
+ for (MachineBasicBlock *Pred : Block.predecessors()) { + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + break; + } + + int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - + PredScoreBrackets->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); + int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - + PredScoreBrackets->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) - + KillWaitBrackets[I]->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); + int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) - + KillWaitBrackets[I]->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); + } + } + +#if 0 + // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker. + // TODO: how does LC distinguish between function entry and main entry? + // If this is the entry to a function, force a wait. + MachineBasicBlock &Entry = Block.getParent()->front(); + if (Entry.getNumber() == Block.getNumber()) { + ScoreBrackets->setWaitAtBeginning(); + return; + } +#endif + + // Now set the current Block's brackets to the largest ending bracket. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + ScoreBrackets->setScoreUB(T, MaxPending[T]); + ScoreBrackets->setScoreLB(T, 0); + ScoreBrackets->setLastFlat(T, MaxFlat[T]); + } + + ScoreBrackets->setMixedExpTypes(MixedExpTypes); + + // Set the register scoreboard. + for (MachineBasicBlock *Pred : Block.predecessors()) { + if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + break; + } + + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + + // Now merge the gpr_reg_score information + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int PredLB = PredScoreBrackets->getScoreLB(T); + int PredUB = PredScoreBrackets->getScoreUB(T); + if (PredLB < PredUB) { + int PredScale = MaxPending[T] - PredUB; + // Merge vgpr scores. + for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { + int PredRegScore = PredScoreBrackets->getRegScore(J, T); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); + } + // Also need to merge sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { + int PredRegScore = + PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J + NUM_ALL_VGPRS, LGKM_CNT, + std::max( + ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), + NewRegScore)); + } + } + } + } + + // Also merge the WaitEvent information. 
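// Editor's note (invented numbers): the merge rebases predecessor state into
// this block's fresh bracket [0, MaxPending[T]]. If a predecessor ends with
// ScoreLB = 3 and ScoreUB = 9 (contributing MaxPending = 6) and an event upper
// bound of 7, that event is carried over as 6 + 7 - 9 = 4, preserving its
// distance from the top of the bracket; register scores are rebased the same
// way via PredScale above.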
+ ForAllWaitEventType(W) { + enum InstCounterType T = PredScoreBrackets->eventCounter(W); + int PredEventUB = PredScoreBrackets->getEventUB(W); + if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { + int NewEventUB = + MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); + if (NewEventUB > 0) { + ScoreBrackets->setEventUB( + W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); + } + } + } + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + // Set the register scoreboard. + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + // Now merge the gpr_reg_score information. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int PredLB = KillWaitBrackets[I]->getScoreLB(T); + int PredUB = KillWaitBrackets[I]->getScoreUB(T); + if (PredLB < PredUB) { + int PredScale = MaxPending[T] - PredUB; + // Merge vgpr scores. + for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) { + int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); + } + // Also need to merge sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) { + int PredRegScore = + KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J + NUM_ALL_VGPRS, LGKM_CNT, + std::max( + ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), + NewRegScore)); + } + } + } + } + + // Also merge the WaitEvent information. + ForAllWaitEventType(W) { + enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W); + int PredEventUB = KillWaitBrackets[I]->getEventUB(W); + if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) { + int NewEventUB = + MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T); + if (NewEventUB > 0) { + ScoreBrackets->setEventUB( + W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); + } + } + } + } + } + + // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the + // sequencing predecessors, because changes to EXEC require waitcnts due to + // the delayed nature of these operations. + for (MachineBasicBlock *Pred : Block.predecessors()) { + if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + break; + } + + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + + int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); + if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { + int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - + PredScoreBrackets->getScoreUB(EXP_CNT); + if (new_gds_ub > 0) { + ScoreBrackets->setEventUB( + GDS_GPR_LOCK, + std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); + } + } + int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); + if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { + int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - + PredScoreBrackets->getScoreUB(EXP_CNT); + if (new_exp_ub > 0) { + ScoreBrackets->setEventUB( + EXP_GPR_LOCK, + std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); + } + } + } +} + +/// Return the "bottom" block of a loop. This differs from +/// MachineLoop::getBottomBlock in that it works even if the loop is +/// discontiguous. 
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { + MachineBasicBlock *Bottom = Loop->getHeader(); + for (MachineBasicBlock *MBB : Loop->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} + +// Generate s_waitcnt instructions where needed. +void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, + MachineBasicBlock &Block) { + // Initialize the state information. + mergeInputScoreBrackets(Block); + + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); + + DEBUG({ + dbgs() << "Block" << Block.getNumber(); + ScoreBrackets->dump(); + }); + + bool InsertNOP = false; + + // Walk over the instructions. + for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); + Iter != E;) { + MachineInstr &Inst = *Iter; + // Remove any previously existing waitcnts. + if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { + // TODO: Register the old waitcnt and optimize the following waitcnts. + // Leaving the previously existing waitcnts is conservatively correct. + if (CompilerGeneratedWaitcntSet.find(&Inst) == + CompilerGeneratedWaitcntSet.end()) + ++Iter; + else { + ScoreBrackets->setWaitcnt(&Inst); + ++Iter; + Inst.removeFromParent(); + } + continue; + } + + // Kill instructions generate a conditional branch to the endmain block. + // Merge the current waitcnt state into the endmain block information. + // TODO: Are there other flavors of KILL instruction? + if (Inst.getOpcode() == AMDGPU::KILL) { + addKillWaitBracket(ScoreBrackets); + } + + bool VCCZBugWorkAround = false; + if (readsVCCZ(Inst) && + (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) { + if (ScoreBrackets->getScoreLB(LGKM_CNT) < + ScoreBrackets->getScoreUB(LGKM_CNT) && + ScoreBrackets->hasPendingSMEM()) { + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) + VCCZBugWorkAround = true; + } + } + + // Generate an s_waitcnt instruction to be placed before + // cur_Inst, if needed. + MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); + + if (SWaitInst) { + Block.insert(Inst, SWaitInst); + if (ScoreBrackets->getWaitcnt() != SWaitInst) { + DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << Inst << '\n' + << "New Instr: " << *SWaitInst << '\n';); + } + } + + updateEventWaitCntAfter(Inst, ScoreBrackets); + +#if 0 // TODO: implement resource type check controlled by options with ub = LB. + // If this instruction generates a S_SETVSKIP because it is an + // indexed resource, and we are on Tahiti, then it will also force + // an S_WAITCNT vmcnt(0) + if (RequireCheckResourceType(Inst, context)) { + // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. + ScoreBrackets->setScoreLB(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } +#endif + + ScoreBrackets->clearWaitcnt(); + + if (SWaitInst) { + DEBUG({ SWaitInst->print(dbgs() << '\n'); }); + } + DEBUG({ + Inst.print(dbgs()); + ScoreBrackets->dump(); + }); + + // Check to see if this is a GWS instruction. If so, and if this is CI or + // VI, then the generated code sequence will include an S_WAITCNT 0. + // TODO: Are these the only GWS instructions? 
+ if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || + Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { + // TODO: && context->target_info->GwsRequiresMemViolTest() ) { + ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + } + + // TODO: Remove this work-around after fixing the scheduler and enable the + // assert above. + if (VCCZBugWorkAround) { + // Restore the vccz bit. Any time a value is written to vcc, the vcc + // bit is updated, so we can restore the bit by reading the value of + // vcc and then writing it back to the register. + BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::VCC) + .addReg(AMDGPU::VCC); + VCCZBugHandledSet.insert(&Inst); + } + + if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + + // This avoids a s_nop after a waitcnt has just been inserted. + if (!SWaitInst && InsertNOP) { + BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + } + InsertNOP = false; + + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + bool IsSMEM = false; + bool IsVMEM = false; + if (TII->isSMRD(Inst)) + IsSMEM = true; + else if (TII->usesVM_CNT(Inst)) + IsVMEM = true; + + ++Iter; + if (Iter == E) + break; + + MachineInstr &Next = *Iter; + + // TODO: How about consecutive SMEM instructions? + // The comments above says break the clause but the code does not. + // if ((TII->isSMRD(next) && isSMEM) || + if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM && + // TODO: Enable this check when hasSoftClause is upstreamed. + // ST->hasSoftClauses() && + ST->isXNACKEnabled()) { + // Insert a NOP to break the clause. + InsertNOP = true; + continue; + } + + // There must be "S_NOP 0" between an instruction writing M0 and + // S_SENDMSG. + if ((Next.getOpcode() == AMDGPU::S_SENDMSG || + Next.getOpcode() == AMDGPU::S_SENDMSGHALT) && + Inst.definesRegister(AMDGPU::M0)) + InsertNOP = true; + + continue; + } + + ++Iter; + } + + // Check if we need to force convergence at loop footer. + MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); + if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + WaitcntData->print(); + DEBUG(dbgs() << '\n';); + + // The iterative waitcnt insertion algorithm aims for optimal waitcnt + // placement and doesn't always guarantee convergence for a loop. Each + // loop should take at most 2 iterations for it to converge naturally. + // When this max is reached and result doesn't converge, we force + // convergence by inserting a s_waitcnt at the end of loop footer. + if (WaitcntData->getIterCnt() > 2) { + // To ensure convergence, need to make wait events at loop footer be no + // more than those from the previous iteration. + // As a simplification, Instead of tracking individual scores and + // generate the precise wait count, just wait on 0. 
+ bool HasPending = false; + MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + HasPending = true; + } + } + + if (HasPending) { + if (!SWaitInst) { + SWaitInst = Block.getParent()->CreateMachineInstr( + TII->get(AMDGPU::S_WAITCNT), DebugLoc()); + CompilerGeneratedWaitcntSet.insert(SWaitInst); + const MachineOperand &Op = MachineOperand::CreateImm(0); + SWaitInst->addOperand(MF, Op); +#if 0 // TODO: Format the debug output + OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); + OutputTransformAdd(SWaitInst, context); +#endif + } +#if 0 // TODO: ?? + _DEV( REPORTED_STATS->force_waitcnt_converge = 1; ) +#endif + } + + if (SWaitInst) { + DEBUG({ + SWaitInst->print(dbgs()); + dbgs() << "\nAdjusted score board:"; + ScoreBrackets->dump(); + }); + + // Add this waitcnt to the block. It is either newly created or + // created in previous iterations and added back since block traversal + // always remove waitcnt. + insertWaitcntBeforeCF(Block, SWaitInst); + WaitcntData->setWaitcnt(SWaitInst); + } + } + } +} + +bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<SISubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + AMDGPUASI = ST->getAMDGPUAS(); + + HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + + HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); + HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); + assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS); + assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + + RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + RegisterEncoding.VGPRL = + RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1; + RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + RegisterEncoding.SGPRL = + RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + + // Walk over the blocks in reverse post-dominator order, inserting + // s_waitcnt where needed. + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + bool Modified = false; + for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator + I = RPOT.begin(), + E = RPOT.end(), J = RPOT.begin(); + I != E;) { + MachineBasicBlock &MBB = **I; + + BlockVisitedSet.insert(&MBB); + + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); + if (!ScoreBrackets) { + BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>(); + ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); + } + ScoreBrackets->setPostOrder(MBB.getNumber()); + MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB); + if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr) + LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>(); + + // If we are walking into the block from before the loop, then guarantee + // at least 1 re-walk over the loop to propagate the information, even if + // no S_WAITCNT instructions were generated. 
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I && + (BlockWaitcntProcessedSet.find(&MBB) == + BlockWaitcntProcessedSet.end())) { + BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); + DEBUG(dbgs() << "set-revisit: block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } + + // Walk over the instructions. + insertWaitcntInBlock(MF, MBB); + + // Flag that waitcnts have been processed at least once. + BlockWaitcntProcessedSet.insert(&MBB); + + // See if we want to revisit the loop. + if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) { + MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); + BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); + if (EntrySB && EntrySB->getRevisitLoop()) { + EntrySB->setRevisitLoop(false); + J = I; + int32_t PostOrder = EntrySB->getPostOrder(); + // TODO: Avoid this loop. Find another way to set I. + for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator + X = RPOT.begin(), + Y = RPOT.end(); + X != Y; ++X) { + MachineBasicBlock &MBBX = **X; + if (MBBX.getNumber() == PostOrder) { + I = X; + break; + } + } + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + WaitcntData->incIterCnt(); + DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';); + continue; + } else { + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + // Loop converged, reset iteration count. If this loop gets revisited, + // it must be from an outer loop, the counter will restart, this will + // ensure we don't force convergence on such revisits. + WaitcntData->resetIterCnt(); + } + } + + J = I; + ++I; + } + + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + + bool HaveScalarStores = false; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + + MachineBasicBlock &MBB = *BI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; + ++I) { + + if (!HaveScalarStores && TII->isScalarStore(*I)) + HaveScalarStores = true; + + if (I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } + } + + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could be + // improved by looking across blocks for flushes in postdominating blocks + // from the stores but an explicitly requested flush is probably very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } + } + } + } + + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. 
+ + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Modified = true; + } + + return Modified; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index fceabd7..bc86515 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -21,16 +21,32 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <new> +#include <utility> #define DEBUG_TYPE "si-insert-waits" using namespace llvm; -using namespace llvm::AMDGPU; namespace { @@ -42,7 +58,6 @@ typedef union { unsigned LGKM; } Named; unsigned Array[3]; - } Counters; typedef enum { @@ -55,13 +70,12 @@ typedef Counters RegCounters[512]; typedef std::pair<unsigned, unsigned> RegInterval; class SIInsertWaits : public MachineFunctionPass { - private: - const SISubtarget *ST; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; + const SISubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI; - IsaVersion IV; + AMDGPU::IsaInfo::IsaVersion ISA; /// \brief Constant zero value static const Counters ZeroCounts; @@ -86,7 +100,7 @@ private: RegCounters DefinedRegs; /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen; + unsigned ExpInstrTypesSeen = 0; /// \brief Type of the last opcode. InstType LastOpcodeType; @@ -100,7 +114,7 @@ private: bool ReturnsVoid; /// Whether the VCCZ bit is possibly corrupt - bool VCCZCorrupt; + bool VCCZCorrupt = false; /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -141,13 +155,7 @@ private: public: static char ID; - SIInsertWaits() : - MachineFunctionPass(ID), - ST(nullptr), - TII(nullptr), - TRI(nullptr), - ExpInstrTypesSeen(0), - VCCZCorrupt(false) { } + SIInsertWaits() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -161,7 +169,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, "SI Insert Waits", false, false) @@ -208,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // XXX - What if this is a write into a super register? const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = RC->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.Named.LGKM = Size > 32 ? 2 : 1; } else { // s_dcache_inv etc. do not have a a destination register. Assume we // want a wait on these. 
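Much of the mechanical churn in this file, and in the SIInstrInfo.cpp changes further down, comes from querying register sizes in bits through TargetRegisterInfo rather than in bytes through TargetRegisterClass. A minimal sketch of the equivalence, assuming RC names a 64-bit class such as SReg_64 and TRI is the pass's SIRegisterInfo:

    // Old API: size in bytes, taken from the register class itself.
    //   unsigned Bytes  = RC->getSize();          // 8 for SReg_64
    //   unsigned Dwords = Bytes / 4;              // 2
    // New API: size in bits, taken from the register info.
    unsigned Bits   = TRI->getRegSizeInBits(*RC);  // 64 for SReg_64
    unsigned Dwords = Bits / 32;                   // still 2 dwords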
@@ -281,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, const MachineOperand &Reg) const { - unsigned Size = RC->getSize(); - assert(Size >= 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + assert(Size >= 32); RegInterval Result; Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 4; + Result.second = Result.first + Size / 32; return Result; } @@ -294,7 +302,6 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Increment) { - // Get the hardware counter increments and sum them up Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -366,7 +373,6 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Required) { - // End of program? No need to wait on anything // A function not returning void needs to wait, because other bytecode will // be appended after it and we don't know what it will be. @@ -393,7 +399,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, bool NeedWait = false; for (unsigned i = 0; i < 3; ++i) { - if (Required.Array[i] <= WaitedOn.Array[i]) continue; @@ -421,10 +426,10 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(encodeWaitcnt(IV, - Counts.Named.VM, - Counts.Named.EXP, - Counts.Named.LGKM)); + .addImm(AMDGPU::encodeWaitcnt(ISA, + Counts.Named.VM, + Counts.Named.EXP, + Counts.Named.LGKM)); LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -434,7 +439,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, /// \brief helper function for handleOperands static void increaseCounters(Counters &Dst, const Counters &Src) { - for (unsigned i = 0; i < 3; ++i) Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } @@ -453,9 +457,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { unsigned Imm = I->getOperand(0).getImm(); Counters Counts, WaitOn; - Counts.Named.VM = decodeVmcnt(IV, Imm); - Counts.Named.EXP = decodeExpcnt(IV, Imm); - Counts.Named.LGKM = decodeLgkmcnt(IV, Imm); + Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm); + Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm); + Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); for (unsigned i = 0; i < 3; ++i) { if (Counts.Array[i] <= LastIssued.Array[i]) @@ -468,7 +472,6 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { } Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - Counters Result = ZeroCounts; // For each register affected by this instruction increase the result @@ -484,7 +487,6 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { - if (Op.isDef()) { increaseCounters(Result, UsedRegs[j]); increaseCounters(Result, DefinedRegs[j]); @@ -522,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, } } +/// Return true if \p MBB has one successor immediately following, and is its +/// only predecessor +static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { + if (MBB.succ_size() != 1) + return false; + + const MachineBasicBlock *Succ = *MBB.succ_begin(); + return 
(Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ); +} + // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -531,12 +543,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - IV = getIsaVersion(ST->getFeatureBits()); + ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - HardwareLimits.Named.VM = getVmcntBitMask(IV); - HardwareLimits.Named.EXP = getExpcntBitMask(IV); - HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV); + HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA); + HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA); + HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; @@ -618,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if ((I->getOpcode() == AMDGPU::S_BARRIER && - ST->needWaitcntBeforeBarrier()) || + !ST->hasAutoWaitcntBeforeBarrier()) || I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT) Required = LastIssued; @@ -636,12 +648,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { handleSendMsg(MBB, I); if (I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN) + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) EndPgmBlocks.push_back(&MBB); } - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + // Wait for everything at the end of the MBB. If there is only one + // successor, we can defer this until the uses there. + if (!hasTrivialSuccessor(MBB)) + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } if (HaveScalarStores) { @@ -665,7 +679,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // FIXME: It would be better to insert this before a waitcnt if any. if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) { Changes = true; BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); } @@ -676,5 +690,19 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineInstr *I : RemoveMI) I->eraseFromParent(); + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. 
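handleExistingWait above merges an already-present s_waitcnt by splitting its immediate back into per-counter values; the decode helpers invert encodeWaitcnt for the queried ISA version. A small round-trip sketch, assuming the pass's ISA member, with counter values chosen purely for illustration:

    // Illustration only: pack three counter values and recover them again.
    unsigned Imm  = AMDGPU::encodeWaitcnt(ISA, /*Vmcnt=*/3, /*Expcnt=*/0,
                                          /*Lgkmcnt=*/1);
    unsigned VM   = AMDGPU::decodeVmcnt(ISA, Imm);   // 3
    unsigned EXP  = AMDGPU::decodeExpcnt(ISA, Imm);  // 0
    unsigned LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); // 1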
+ MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Changes = true; + } + return Changes; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 5523ec1..02c9b4b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -31,6 +31,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bit VOP2 = 0; field bit VOPC = 0; field bit VOP3 = 0; + field bit VOP3P = 0; field bit VINTRP = 0; field bit SDWA = 0; field bit DPP = 0; @@ -78,6 +79,10 @@ class InstSI <dag outs, dag ins, string asm = "", // is unable to infer the encoding from the operands. field bit VOPAsmPrefer32Bit = 0; + // This bit indicates that this has a floating point result type, so + // the clamp modifier has floating point semantics. + field bit FPClamp = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -92,6 +97,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{8} = VOP2; let TSFlags{9} = VOPC; let TSFlags{10} = VOP3; + let TSFlags{12} = VOP3P; let TSFlags{13} = VINTRP; let TSFlags{14} = SDWA; @@ -120,6 +126,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{39} = ScalarStore; let TSFlags{40} = FixedSize; let TSFlags{41} = VOPAsmPrefer32Bit; + let TSFlags{42} = FPClamp; let SchedRW = [Write32Bit]; @@ -131,19 +138,19 @@ class InstSI <dag outs, dag ins, string asm = "", let AsmVariantName = AMDGPUAsmVariants.Default; } -class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : InstSI<outs, ins, "", pattern> { +class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : InstSI<outs, ins, asm, pattern> { let isPseudo = 1; let isCodeGenOnly = 1; } -class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : PseudoInstSI<outs, ins, pattern> { +class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : PseudoInstSI<outs, ins, pattern, asm> { let SALU = 1; } -class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : PseudoInstSI<outs, ins, pattern> { +class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : PseudoInstSI<outs, ins, pattern, asm> { let VALU = 1; let Uses = [EXEC]; } @@ -221,10 +228,10 @@ class EXPe : Enc64 { bits<1> compr; bits<1> done; bits<1> vm; - bits<8> vsrc0; - bits<8> vsrc1; - bits<8> vsrc2; - bits<8> vsrc3; + bits<8> src0; + bits<8> src1; + bits<8> src2; + bits<8> src3; let Inst{3-0} = en; let Inst{9-4} = tgt; @@ -232,10 +239,10 @@ class EXPe : Enc64 { let Inst{11} = done; let Inst{12} = vm; let Inst{31-26} = 0x3e; - let Inst{39-32} = vsrc0; - let Inst{47-40} = vsrc1; - let Inst{55-48} = vsrc2; - let Inst{63-56} = vsrc3; + let Inst{39-32} = src0; + let Inst{47-40} = src1; + let Inst{55-48} = src2; + let Inst{63-56} = src3; } let Uses = [EXEC] in { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 26a8d22..a7e0feb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -20,9 +20,10 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" 
-#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Debug.h" @@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); SIInstrInfo::SIInstrInfo(const SISubtarget &ST) - : AMDGPUInstrInfo(ST), RI(), ST(ST) {} + : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -137,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, } if (isSMRD(Opc0) && isSMRD(Opc1)) { + // Skip time and cache invalidation instructions. + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) + return false; + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); // Check base reg. @@ -244,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, unsigned EltSize; if (LdSt.mayLoad()) - EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; else { assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; } if (isStride64(Opc)) @@ -315,7 +321,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, const MachineOperand *SecondDst = nullptr; if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || + (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { @@ -343,7 +350,22 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, FirstLdSt.getParent()->getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; + return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; +} + +static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) { + MachineFunction *MF = MBB.getParent(); + DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), + "illegal SGPR to VGPR copy", + DL, DS_Error); + LLVMContext &C = MF->getFunction()->getContext(); + C.diagnose(IllegalCopy); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -369,7 +391,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -391,7 +417,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + BuildMI(MBB, MI, DL, 
get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -408,15 +438,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RC->getSize() > 4) { + if (RI.getRegSizeInBits(*RC) > 32) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { Opcode = AMDGPU::S_MOV_B32; EltSize = 4; } + + if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } } + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); @@ -432,13 +468,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); - if (Idx == SubIndices.size() - 1) - Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); - if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - Builder.addReg(SrcReg, RegState::Implicit); + bool UseKill = KillSrc && Idx == SubIndices.size() - 1; + Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); } } @@ -460,13 +494,195 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + int64_t Value) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); + if (RegClass == &AMDGPU::SReg_32RegClass || + RegClass == &AMDGPU::SGPR_32RegClass || + RegClass == &AMDGPU::SReg_32_XM0RegClass || + RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::SReg_64RegClass || + RegClass == &AMDGPU::SGPR_64RegClass || + RegClass == &AMDGPU::SReg_64_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::VGPR_32RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(Value); + return; + } + if (RegClass == &AMDGPU::VReg_64RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) + .addImm(Value); + return; + } + + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RegClass)) { + if (RI.getRegSizeInBits(*RegClass) > 32) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } + } + + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + int64_t IdxValue = Idx == 0 ? 
Value : 0; + + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, Idx)); + Builder.addImm(IdxValue); + } +} + +const TargetRegisterClass * +SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, + unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && + "Not a VGPR32 reg"); + + if (Cond.size() == 1) { + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(Cond[0]); + } else if (Cond.size() == 2) { + assert(Cond[0].isImm() && "Cond[0] is not an immediate"); + switch (Cond[0].getImm()) { + case SIInstrInfo::SCC_TRUE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::SCC_FALSE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::VCCNZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(RegOp); + break; + } + case SIInstrInfo::VCCZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(TrueReg) + .addReg(FalseReg) + .add(RegOp); + break; + } + case SIInstrInfo::EXECNZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::EXECZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + llvm_unreachable("Unhandled branch predicate EXECZ"); + break; + } + default: + llvm_unreachable("invalid branch predicate"); + } + } else { + llvm_unreachable("Can only handle Cond size 1 or 2"); + } +} + +unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + +unsigned SIInstrInfo::insertNE(MachineBasicBlock 
*MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (DstRC->getSize() == 4) { + if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { return AMDGPU::V_MOV_B64_PSEUDO; } return AMDGPU::COPY; @@ -526,17 +742,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align); + unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. - const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -546,14 +763,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -571,13 +788,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addReg(MFI->getFrameOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } @@ -629,6 +846,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -639,8 +857,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); + if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -649,11 +867,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -670,12 +888,12 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(0) // offset + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getFrameOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } @@ -796,6 +1014,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, insertWaitStates(MBB, MI, 1); } +void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { + auto MF = MBB.getParent(); + SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + + assert(Info->isEntryFunction()); + + if (MBB.succ_empty()) { + bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); + if (HasNoTerminator) + BuildMI(MBB, MBB.end(), DebugLoc(), + get(Info->returnsVoid() ? 
AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + } +} + unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -870,9 +1102,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstr *MovRel = BuildMI(MBB, MI, DL, MovRelDesc) .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(2)) .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + .addReg(VecReg, + RegState::Implicit | (IsUndef ? RegState::Undef : 0)); const int ImpDefIdx = MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); @@ -897,14 +1130,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // constant data. Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) - .addOperand(MI.getOperand(1))); + .add(MI.getOperand(1))); MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi); if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) MIB.addImm(0); else - MIB.addOperand(MI.getOperand(2)); + MIB.add(MI.getOperand(2)); Bundler.append(MIB); llvm::finalizeBundle(MBB, Bundler.begin()); @@ -1202,14 +1435,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, return false; } - BranchPredicate Pred = getBranchPredicate(I->getOpcode()); - if (Pred == INVALID_BR) - return true; + MachineBasicBlock *CondBB = nullptr; - MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(Pred)); - Cond.push_back(I->getOperand(1)); // Save the branch register. + if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + CondBB = I->getOperand(1).getMBB(); + Cond.push_back(I->getOperand(0)); + } else { + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) + return true; + CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. + } ++I; if (I == MBB.end()) { @@ -1290,6 +1529,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, return Count; } +// Copy the flags onto the implicit condition register operand. +static void preserveCondRegFlags(MachineOperand &CondReg, + const MachineOperand &OrigCond) { + CondReg.setIsUndef(OrigCond.isUndef()); + CondReg.setIsKill(OrigCond.isKill()); +} + unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, @@ -1305,6 +1551,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, return 1; } + if(Cond.size() == 1 && Cond[0].isReg()) { + BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) + .add(Cond[0]) + .addMBB(TBB); + return 1; + } + assert(TBB && Cond[0].isImm()); unsigned Opcode @@ -1317,9 +1570,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, .addMBB(TBB); // Copy the flags onto the implicit condition register operand. 
- MachineOperand &CondReg = CondBr->getOperand(1); - CondReg.setIsUndef(Cond[1].isUndef()); - CondReg.setIsKill(Cond[1].isKill()); + preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); if (BytesAdded) *BytesAdded = 4; @@ -1346,9 +1597,167 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { - assert(Cond.size() == 2); - Cond[0].setImm(-Cond[0].getImm()); - return false; + if (Cond.size() != 2) { + return true; + } + + if (Cond[0].isImm()) { + Cond[0].setImm(-Cond[0].getImm()); + return false; + } + + return true; +} + +bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, + int &TrueCycles, int &FalseCycles) const { + switch (Cond[0].getImm()) { + case VCCNZ: + case VCCZ: { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? + + // Limit to equal cost for branch vs. N v_cndmask_b32s. + return !RI.isSGPRClass(RC) && NumInsts <= 6; + } + case SCC_TRUE: + case SCC_FALSE: { + // FIXME: We could insert for VGPRs if we could replace the original compare + // with a vector one. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + + // Multiples of 8 can do s_cselect_b64 + if (NumInsts % 2 == 0) + NumInsts /= 2; + + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? + return RI.isSGPRClass(RC); + } + default: + return false; + } +} + +void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); + if (Pred == VCCZ || Pred == SCC_FALSE) { + Pred = static_cast<BranchPredicate>(-Pred); + std::swap(TrueReg, FalseReg); + } + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + unsigned DstSize = RI.getRegSizeInBits(*DstRC); + + if (DstSize == 32) { + unsigned SelOp = Pred == SCC_TRUE ? + AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; + + // Instruction's operands are backwards from what is expected. 
+ MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + if (DstSize == 64 && Pred == SCC_TRUE) { + MachineInstr *Select = + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + }; + + unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; + const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; + const int16_t *SubIndices = Sub0_15; + int NElts = DstSize / 32; + + // 64-bit select is only avaialble for SALU. + if (Pred == SCC_TRUE) { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + + assert(NElts % 2 == 0); + NElts /= 2; + } + + MachineInstrBuilder MIB = BuildMI( + MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); + + I = MIB->getIterator(); + + SmallVector<unsigned, 8> Regs; + for (int Idx = 0; Idx != NElts; ++Idx) { + unsigned DstElt = MRI.createVirtualRegister(EltRC); + Regs.push_back(DstElt); + + unsigned SubIdx = SubIndices[Idx]; + + MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, 0, SubIdx) + .addReg(TrueReg, 0, SubIdx); + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + + MIB.addReg(DstElt) + .addImm(SubIdx); + } +} + +bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: { + // If there are additional implicit register operands, this may be used for + // register indexing so the source register operand isn't simply copied. + unsigned NumOps = MI.getDesc().getNumOperands() + + MI.getDesc().getNumImplicitUses(); + + return MI.getNumOperands() == NumOps; + } + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } } static void removeModOperands(MachineInstr &MI) { @@ -1400,15 +1809,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { - bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; - - // Don't fold if we are using source modifiers. The new VOP2 instructions - // don't have them. - if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { + // Don't fold if we are using source or output modifiers. The new VOP2 + // instructions don't have them. 
+ if (hasAnyModifiersSet(UseMI)) return false; - } const MachineOperand &ImmOp = DefMI.getOperand(1); @@ -1421,6 +1825,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (isInlineConstant(UseMI, *Src0, ImmOp)) return false; + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -1617,10 +2022,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, return nullptr; case AMDGPU::V_MAC_F16_e64: IsF16 = true; + LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: IsF16 = true; + LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -1633,20 +2040,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src0Mods = + getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mods = + getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); return BuildMI(*MBB, MI, MI.getDebugLoc(), get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) - .addOperand(*Dst) - .addImm(0) // Src0 mods - .addOperand(*Src0) - .addImm(0) // Src1 mods - .addOperand(*Src1) + .add(*Dst) + .addImm(Src0Mods ? Src0Mods->getImm() : 0) + .add(*Src0) + .addImm(Src1Mods ? Src1Mods->getImm() : 0) + .add(*Src1) .addImm(0) // Src mods - .addOperand(*Src2) - .addImm(0) // clamp - .addImm(0); // omod + .add(*Src2) + .addImm(Clamp ? Clamp->getImm() : 0) + .addImm(Omod ? Omod->getImm() : 0); } // It's not generally safe to move VALU instructions across these since it will @@ -1687,7 +2100,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); case 16: - return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); default: llvm_unreachable("invalid bitwidth"); @@ -1696,7 +2110,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + if (!MO.isImm() || + OperandType < AMDGPU::OPERAND_SRC_FIRST || + OperandType > AMDGPU::OPERAND_SRC_LAST) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -1705,24 +2121,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
int64_t Imm = MO.getImm(); - switch (operandBitWidth(OperandType)) { - case 32: { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); return Trunc == Imm && AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } - case 64: { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); } - case 16: { + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { + // A few special case instructions have 16-bit operands on subtargets + // where 16-bit instructions are not legal. + // TODO: Do the 32-bit immediates work? We shouldn't really need to handle + // constants in these cases int16_t Trunc = static_cast<int16_t>(Imm); - return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); } return false; } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + uint32_t Trunc = static_cast<uint32_t>(Imm); + return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); + } default: llvm_unreachable("invalid bitwidth"); } @@ -1801,6 +2236,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, return Mods && Mods->getImm(); } +bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { + return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::clamp) || + hasModifiersSet(MI, AMDGPU::OpName::omod); +} + bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -1890,7 +2333,12 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + + const MachineFunction *MF = MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); @@ -1989,8 +2437,77 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify SDWA + if (isSDWA(MI)) { + + if (!ST.hasSDWA()) { + ErrInfo = "SDWA is not supported on this target"; + return false; + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + + const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx: OpIndicies) { + if (OpIdx == -1) + continue; + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (!ST.hasSDWAScalar()) { + // Only VGPRS on VI + if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { + ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; + return false; + } + } 
else { + // No immediates on GFX9 + if (!MO.isReg()) { + ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; + return false; + } + } + } + + if (!ST.hasSDWAOmod()) { + // No omod allowed on VI + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod != nullptr && + (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in SDWA instructions on VI"; + return false; + } + } + + uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + if (isVOPC(BasicOpcode)) { + if (!ST.hasSDWASdst() && DstIdx != -1) { + // Only vcc allowed as dst on VI for VOPC + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { + ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; + return false; + } + } else if (!ST.hasSDWAOutModsVOPC()) { + // No clamp allowed on GFX9 for VOPC + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { + ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; + return false; + } + + // No omod allowed on GFX9 for VOPC + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; + return false; + } + } + } + } + // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. @@ -2120,6 +2637,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) { + const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); + if (Offset->getImm() != 0) { + ErrInfo = "subtarget does not support offsets in flat instructions"; + return false; + } + } + return true; } @@ -2238,7 +2763,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); + BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); MO.ChangeToRegister(Reg, false); } @@ -2417,6 +2942,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) return; + // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for + // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane + // select is uniform. + if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && + RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + return; + } + // We do not use commuteInstruction here because it is too aggressive and will // commute if it is possible. We only want to commute here if it improves // legality. 
This can be called a fairly large number of times so don't waste @@ -2511,7 +3049,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); unsigned DstReg = MRI.createVirtualRegister(SRC); - unsigned SubRegs = VRC->getSize() / 4; + unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { @@ -2564,8 +3102,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, return; unsigned DstReg = MRI.createVirtualRegister(DstRC); - MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) - .addOperand(Op); + MachineInstr *Copy = + BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); Op.setReg(DstReg); Op.setSubReg(0); @@ -2810,13 +3348,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { // Regular buffer load / store. MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) + .add(*VData) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset); + .add(*SRsrc) + .add(*SOffset) + .add(*Offset); // Atomics do not have this operand. if (const MachineOperand *GLC = @@ -2836,14 +3374,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } else { // Atomics with return. Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addOperand(*VDataIn) + .add(*VData) + .add(*VDataIn) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) + .add(*SRsrc) + .add(*SOffset) + .add(*Offset) .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } @@ -2870,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector<MachineInstr *, 128> Worklist; - Worklist.push_back(&TopInst); + SetVectorType Worklist; + Worklist.insert(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); @@ -2970,6 +3508,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); + + case AMDGPU::S_PACK_LL_B32_B16: + case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HH_B32_B16: { + movePackToVALU(Worklist, MRI, Inst); + Inst.eraseFromParent(); + continue; + } } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -3027,12 +3573,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); unsigned NewDstReg = AMDGPU::NoRegister; if (HasDst) { + unsigned DstReg = Inst.getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + continue; + // Update the destination register class. 
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); if (!NewDstRC) continue; - unsigned DstReg = Inst.getOperand(0).getReg(); if (Inst.isCopy() && TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { @@ -3061,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } -void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3086,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3112,15 +3661,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0); + BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1); + BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -3139,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3174,8 +3721,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp( unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); + .add(SrcReg0Sub0) + .add(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); @@ -3184,8 +3731,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp( unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); + .add(SrcReg0Sub1) + .add(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -3206,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp( } void SIInstrInfo::splitScalar64BitBCNT( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { + SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3231,13 +3778,9 @@ void SIInstrInfo::splitScalar64BitBCNT( MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); - BuildMI(MBB, MII, DL, InstDesc, MidReg) - .addOperand(SrcRegSub0) - 
.addImm(0); + BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); - BuildMI(MBB, MII, DL, InstDesc, ResultReg) - .addOperand(SrcRegSub1) - .addReg(MidReg); + BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); @@ -3246,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3310,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const { + SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); + Worklist.insert(&UseMI); do { ++I; @@ -3326,8 +3869,70 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } +void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, + MachineRegisterInfo &MRI, + MachineInstr &Inst) const { + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineBasicBlock *MBB = Inst.getParent(); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // FIXME: Can do a lot better if we know the high bits of src0 or src1 are + // 0. 
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff); + + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) + .addReg(ImmReg, RegState::Kill) + .add(Src0); + + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) + .add(Src1) + .addImm(16) + .addReg(TmpReg, RegState::Kill); + break; + } + case AMDGPU::S_PACK_LH_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) + .addReg(ImmReg, RegState::Kill) + .add(Src0) + .add(Src1); + break; + } + case AMDGPU::S_PACK_HH_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Src0); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff0000); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) + .add(Src1) + .addReg(ImmReg, RegState::Kill) + .addReg(TmpReg, RegState::Kill); + break; + } + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { + MachineInstr &SCCDefInst, SetVectorType &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : @@ -3338,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) - Worklist.push_back(&MI); + Worklist.insert(&MI); } } @@ -3448,10 +4053,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { - RsrcDataFormat |= (1ULL << 56); + // Set ATC = 1. GFX9 doesn't have this bit. + if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) + RsrcDataFormat |= (1ULL << 56); - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) - // Set MTYPE = 2 + // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. + // BTW, it disables TC L2 and therefore decreases performance. + if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (2ULL << 59); } @@ -3463,11 +4071,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; - uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + // GFX9 doesn't have ELEMENT_SIZE. + if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) { + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; + } - Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | - // IndexStride = 64 - (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); + // IndexStride = 64. + Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. 
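
The three movePackToVALU expansions above are plain 32-bit bit math: AND plus V_LSHL_OR for LL, V_BFI for LH, and V_LSHRREV plus V_AND_OR for HH. The standalone check below mirrors what each sequence computes; helper names and test values are illustrative, and the VALU opcodes are modeled by their usual bit semantics (V_BFI_B32(m, a, b) as (a & m) | (b & ~m), V_AND_OR_B32(a, b, c) as (a & b) | c).

#include <cassert>
#include <cstdint>

uint32_t pack_ll(uint32_t S0, uint32_t S1) { // V_AND_B32 + V_LSHL_OR_B32
  uint32_t Tmp = 0xffffu & S0;
  return (S1 << 16) | Tmp;
}
uint32_t pack_lh(uint32_t S0, uint32_t S1) { // V_BFI_B32 with mask 0xffff
  return (S0 & 0xffffu) | (S1 & ~0xffffu);
}
uint32_t pack_hh(uint32_t S0, uint32_t S1) { // V_LSHRREV_B32 + V_AND_OR_B32
  uint32_t Tmp = S0 >> 16;
  return (S1 & 0xffff0000u) | Tmp;
}

int main() {
  uint32_t S0 = 0xAAAA5555u, S1 = 0x1234ABCDu;
  assert(pack_ll(S0, S1) == 0xABCD5555u); // lo16(S0) | lo16(S1) << 16
  assert(pack_lh(S0, S1) == 0x12345555u); // lo16(S0) | hi16(S1) << 16
  assert(pack_hh(S0, S1) == 0x1234AAAAu); // hi16(S0) | hi16(S1) << 16
  return 0;
}
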
@@ -3496,7 +4107,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return AMDGPU::NoRegister; assert(!MI.memoperands_empty() && - (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); + (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); FrameIndex = Addr->getIndex(); return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); @@ -3552,16 +4163,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (DescSize != 0 && DescSize != 4) return DescSize; - if (Opc == AMDGPU::WAVE_BARRIER) - return 0; - // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { - if (isFixedSize(MI)) { - assert(DescSize == 4); + if (isFixedSize(MI)) return DescSize; - } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) @@ -3584,7 +4190,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 4; switch (Opc) { - case AMDGPU::SI_MASK_BRANCH: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: @@ -3609,12 +4214,88 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return true; for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) return true; } return false; } +bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { + return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; +} + +void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const { + MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); + assert(TI != IfEntry->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = IfEntry->getParent(); + MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *SIIF = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) + .add(Branch->getOperand(0)) + .add(Branch->getOperand(1)); + MachineInstr *SIEND = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + .addReg(DstReg); + + IfEntry->erase(TI); + IfEntry->insert(IfEntry->end(), SIIF); + IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); + } +} + +void SIInstrInfo::convertNonUniformLoopRegion( + MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { + MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); + // We expect 2 terminators, one conditional and one unconditional. 
+ assert(TI != LoopEnd->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = LoopEnd->getParent(); + MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder HeaderPHIBuilder = + BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); + for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), + E = LoopEntry->pred_end(); + PI != E; ++PI) { + if (*PI == LoopEnd) { + HeaderPHIBuilder.addReg(BackEdgeReg); + } else { + MachineBasicBlock *PMBB = *PI; + unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), + ZeroReg, 0); + HeaderPHIBuilder.addReg(ZeroReg); + } + HeaderPHIBuilder.addMBB(*PI); + } + MachineInstr *HeaderPhi = HeaderPHIBuilder; + MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), + get(AMDGPU::SI_IF_BREAK), BackEdgeReg) + .addReg(DstReg) + .add(Branch->getOperand(0)); + MachineInstr *SILOOP = + BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) + .addReg(BackEdgeReg) + .addMBB(LoopEntry); + + LoopEntry->insert(LoopEntry->begin(), HeaderPhi); + LoopEnd->erase(TI); + LoopEnd->insert(LoopEnd->end(), SIIFBREAK); + LoopEnd->insert(LoopEnd->end(), SILOOP); + } +} + ArrayRef<std::pair<int, const char *>> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair<int, const char *> TargetIndices[] = { @@ -3640,3 +4321,39 @@ ScheduleHazardRecognizer * SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { return new GCNHazardRecognizer(MF); } + +std::pair<unsigned, unsigned> +SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); +} + +ArrayRef<std::pair<unsigned, const char *>> +SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + static const std::pair<unsigned, const char *> TargetFlags[] = { + { MO_GOTPCREL, "amdgpu-gotprel" }, + { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, + { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, + { MO_REL32_LO, "amdgpu-rel32-lo" }, + { MO_REL32_HI, "amdgpu-rel32-hi" } + }; + + return makeArrayRef(TargetFlags); +} + +bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { + return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI); +} + +MachineInstrBuilder +SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned DestReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e68f6f9..3dd5bc8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -19,6 +19,7 @@ #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" namespace llvm { @@ -38,6 +39,8 @@ private: EXECZ = 3 }; + typedef SmallSetVector<MachineInstr *, 32> SetVectorType; + static unsigned getBranchOpcode(BranchPredicate 
Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); @@ -56,27 +59,30 @@ private: void swapOperands(MachineInstr &Inst) const; - void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; + void movePackToVALU(SetVectorType &Worklist, + MachineRegisterInfo &MRI, + MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; @@ -97,6 +103,8 @@ protected: public: enum TargetOperandFlags { + MO_MASK = 0x7, + MO_NONE = 0, // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. MO_GOTPCREL = 1, @@ -140,6 +148,23 @@ public: RegScavenger *RS, unsigned TmpReg, unsigned Offset, unsigned Size) const; + void materializeImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned DestReg, + int64_t Value) const; + + const TargetRegisterClass *getPreferredSelectRegClass( + unsigned Size) const; + + unsigned insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + + unsigned insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -190,7 +215,7 @@ public: bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const override; + bool AllowModify = false) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; @@ -203,10 +228,29 @@ public: bool reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const override; + + bool canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, + int &TrueCycles, int &FalseCycles) const override; + + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + + void insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; + bool isFoldableCopy(const MachineInstr &MI) 
const; + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -308,6 +352,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isSDWA(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SDWA; + } + + bool isSDWA(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SDWA; + } + static bool isVOPC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VOPC; } @@ -420,6 +472,22 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DPP; } + static bool isVOP3P(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3P; + } + + bool isVOP3P(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3P; + } + + static bool isVINTRP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VINTRP; + } + + bool isVINTRP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VINTRP; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -454,6 +522,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE; } + static bool hasFPClamp(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp; + } + + bool hasFPClamp(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -462,28 +538,6 @@ public: return !RI.isSGPRReg(MRI, Dest); } - static int operandBitWidth(uint8_t OperandType) { - switch (OperandType) { - case AMDGPU::OPERAND_REG_IMM_INT32: - case AMDGPU::OPERAND_REG_IMM_FP32: - case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: - return 32; - case AMDGPU::OPERAND_REG_IMM_INT64: - case AMDGPU::OPERAND_REG_IMM_FP64: - case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: - return 64; - case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: - return 16; - default: - llvm_unreachable("unexpected operand type"); - } - } - bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; @@ -571,6 +625,7 @@ public: bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const; + bool hasAnyModifiersSet(const MachineInstr &MI) const; bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; @@ -598,13 +653,13 @@ public: return 4; } - return RI.getRegClass(OpInfo.RegClass)->getSize(); + return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } /// \brief This form should usually be preferred since it handles operands /// with unknown register classes. unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); + return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } /// \returns true if it is legal for the operand at index \p OpNo @@ -677,6 +732,7 @@ public: void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + void insertReturn(MachineBasicBlock &MBB) const; /// \brief Return the number of wait states that result from executing this /// instruction. 
unsigned getNumWaitStates(const MachineInstr &MI) const; @@ -722,15 +778,40 @@ public: bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; + bool isNonUniformBranchInstr(MachineInstr &Instr) const; + + void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const; + + void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, + MachineBasicBlock *LoopEnd) const; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<int, const char *>> getSerializableTargetIndices() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; + + bool isBasicBlockPrologue(const MachineInstr &MI) const override; + + /// \brief Return a partially built integer add instruction without carry. + /// Caller must add source operands. + /// For pre-GFX9 it will generate unused carry destination operand. + /// TODO: After GFX9 it should return a no-carry operation. + MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned DestReg) const; }; namespace AMDGPU { @@ -741,6 +822,12 @@ namespace AMDGPU { int getVOPe32(uint16_t Opcode); LLVM_READONLY + int getSDWAOp(uint16_t Opcode); + + LLVM_READONLY + int getBasicFromSDWAOp(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); LLVM_READONLY diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index ebaefae..0881736 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -20,6 +20,8 @@ def SIEncodingFamily { int NONE = -1; int SI = 0; int VI = 1; + int SDWA = 2; + int SDWA9 = 3; } //===----------------------------------------------------------------------===// @@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> 
// slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -71,11 +89,6 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; -def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, - SDTCisVT<3, i32>]> ->; - class SDSample<string opcode> : SDNode <opcode, SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> @@ -107,7 +120,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad, >; def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ @@ -144,7 +157,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore, def si_st_local : PatFrag < (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def si_store_local : PatFrag < @@ -196,6 +209,21 @@ def si_uniform_br_scc : PatFrag < return isCBranchSCC(N); }]>; +def lshr_rev : PatFrag < + (ops node:$src1, node:$src0), + (srl $src0, $src1) +>; + +def ashr_rev : PatFrag < + (ops node:$src1, node:$src0), + (sra $src0, $src1) +>; + +def lshl_rev : PatFrag < + (ops node:$src1, node:$src0), + (shl $src0, $src1) +>; + multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { def _glue : SDNode < @@ -266,10 +294,6 @@ def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; -def IMM20bit : PatLeaf <(imm), - [{return isUInt<20>(N->getZExtValue());}] ->; - class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -299,6 +323,23 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{ return Limit < 10; }]>; +def NegateImm : SDNodeXForm<imm, [{ + return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +// TODO: When FP inline imm values work? 
+def NegSubInlineConst32 : ImmLeaf<i32, [{ + return Imm < -16 && Imm >= -64; +}], NegateImm>; + +def NegSubInlineConst16 : ImmLeaf<i16, [{ + return Imm < -16 && Imm >= -64; +}], NegateImm>; + +def ShiftAmt32Imm : PatLeaf <(imm), [{ + return N->getZExtValue() < 32; +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -364,6 +405,14 @@ def SendMsgMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def SwizzleMatchClass : AsmOperandClass { + let Name = "Swizzle"; + let PredicateMethod = "isSwizzle"; + let ParserMethod = "parseSwizzleOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -376,6 +425,11 @@ def SendMsgImm : Operand<i32> { let ParserMatchClass = SendMsgMatchClass; } +def SwizzleImm : Operand<i16> { + let PrintMethod = "printSwizzle"; + let ParserMatchClass = SwizzleMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; @@ -420,6 +474,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +class SDWASrc : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA_SRC"; + let EncoderMethod = "getSDWASrcEncoding"; +} + +def SDWASrc32 : SDWASrc { + let DecoderMethod = "decodeSDWASrc32"; +} + +def SDWASrc16 : SDWASrc { + let DecoderMethod = "decodeSDWASrc16"; +} + +def SDWAVopcDst : VOPDstOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA_VOPC_DST"; + let EncoderMethod = "getSDWAVopcDstEncoding"; + let DecoderMethod = "decodeSDWAVopcDst"; +} + class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -439,22 +514,40 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> { let ParserMatchClass = MatchClass; } +class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; } +class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; } +class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> : + OperandWithDefaultOps<i32, (ops (i32 0))> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + let OperandType = "OPERAND_IMMEDIATE" in { def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; +def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -474,6 +567,9 @@ def lwe : NamedOperandBit<"LWE", 
NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; @@ -486,6 +582,11 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; +def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; +def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; +def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; +def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; + def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { @@ -525,6 +626,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass { let ParserMethod = "parseRegOrImmWithFPInputMods"; let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; @@ -557,6 +659,16 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isSDWARegKind"; +} + +def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -567,6 +679,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } + +def IntRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isSDWARegKind"; +} + +def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -577,6 +700,33 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; } +class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "PackedFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedFP"#opSize#"InputMods"; +} + +class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "PackedInt"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedInt"#opSize#"InputMods"; +} + +def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>; +def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>; + +class 
PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> { +// let PrintMethod = "printPackedFPInputMods"; +} + +class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> { + //let PrintMethod = "printPackedIntInputMods"; +} + +def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>; +def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; //===----------------------------------------------------------------------===// // Complex patterns @@ -588,11 +738,18 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; -def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; -def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; +def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; +// VOP3Mods, but the input source is known to never be NaN. +def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; + +def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; + +def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; +def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">; + //===----------------------------------------------------------------------===// // SI assembler operands @@ -604,19 +761,32 @@ def SIOperand { int FLAT_SCR = 0x68; } +// This should be kept in sync with SISrcMods enum def SRCMODS { int NONE = 0; int NEG = 1; + int ABS = 2; + int NEG_ABS = 3; + + int NEG_HI = ABS; + int OP_SEL_0 = 4; + int OP_SEL_1 = 8; } def DSTCLAMP { int NONE = 0; + int ENABLE = 1; } def DSTOMOD { int NONE = 0; } +def TRAPID{ + int LLVM_TRAP = 2; + int LLVM_DEBUG_TRAP = 3; +} + //===----------------------------------------------------------------------===// // // SI Instruction multiclass helpers. 
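
SRCMODS above is a set of bit flags kept in sync with the SISrcMods enum: NEG and ABS combine into NEG_ABS, and the packed (VOP3P) forms reuse the ABS bit as NEG_HI alongside the two op_sel bits. A small C++ mirror of those values; the enum/function names and the abs-before-negate application order are illustrative.

#include <cassert>
#include <cmath>
#include <cstdint>

enum SrcMods : uint32_t {
  SRCMODS_NONE = 0,
  SRCMODS_NEG = 1,
  SRCMODS_ABS = 2,
  SRCMODS_NEG_ABS = SRCMODS_NEG | SRCMODS_ABS,
  SRCMODS_NEG_HI = SRCMODS_ABS, // VOP3P: same bit, applied to the high half
  SRCMODS_OP_SEL_0 = 4,
  SRCMODS_OP_SEL_1 = 8,
};

// Scalar source modifiers applied as |x| first, then negate, which is what
// the combined NEG_ABS value denotes.
float applySrcMods(float X, uint32_t Mods) {
  if (Mods & SRCMODS_ABS) X = std::fabs(X);
  if (Mods & SRCMODS_NEG) X = -X;
  return X;
}

int main() {
  assert(SRCMODS_NEG_ABS == 3 && SRCMODS_OP_SEL_1 == 8);
  assert(applySrcMods(-2.0f, SRCMODS_ABS) == 2.0f);
  assert(applySrcMods(-2.0f, SRCMODS_NEG_ABS) == -2.0f); // -|-2| == -2
  return 0;
}
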
@@ -648,8 +818,9 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3, exp_vm:$vm, exp_compr:$compr, i8imm:$en), "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", - [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr), - f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> { + [(node (i8 timm:$tgt), (i8 timm:$en), + f32:$src0, f32:$src1, f32:$src2, f32:$src3, + (i1 timm:$compr), (i1 timm:$vm))]> { let AsmMatchConverter = "cvtExp"; } @@ -666,6 +837,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> { def _si : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, EXPe { + let AssemblerPredicates = [isSICI]; let DecoderNamespace = "SICI"; let DisableDecoder = DisableSIDecoder; } @@ -673,6 +845,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> { def _vi : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, EXPe_vi { + let AssemblerPredicates = [isVI]; let DecoderNamespace = "VI"; let DisableDecoder = DisableVIDecoder; } @@ -702,16 +875,46 @@ class getVALUDstForVT<ValueType VT> { VOPDstOperand<SReg_64>)))); // else VT == i1 } +// Returns the register class to use for the destination of VOP[12C] +// instructions with SDWA extension +class getSDWADstForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 1), + SDWAVopcDst, // VOPC + VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst +} + // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); - RegisterOperand ret = !if(isFP, - !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)), - !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32))); + 0)))); + + RegisterOperand ret = + !if(isFP, + !if(!eq(VT.Size, 64), + VSrc_f64, + !if(!eq(VT.Value, f16.Value), + VSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VSrc_f32 + ) + ) + ), + !if(!eq(VT.Size, 64), + VSrc_b64, + !if(!eq(VT.Value, i16.Value), + VSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VSrc_b32 + ) + ) + ) + ); } // Returns the vreg register class to use for source operand given VT @@ -720,30 +923,46 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } +class getSDWASrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); +} // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
class getVOP3SrcForVT<ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); + 0)))); RegisterOperand ret = !if(!eq(VT.Size, 128), - VSrc_128, - !if(!eq(VT.Size, 64), + VSrc_128, + !if(!eq(VT.Size, 64), !if(isFP, - VCSrc_f64, - VCSrc_b64), + VCSrc_f64, + VCSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_b64, - !if(isFP, - !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32), - !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32) - ) - ) - ) - ); + SCSrc_b64, + !if(isFP, + !if(!eq(VT.Value, f16.Value), + VCSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VCSrc_f32 + ) + ), + !if(!eq(VT.Value, i16.Value), + VCSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VCSrc_b32 + ) + ) + ) + ) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. @@ -753,7 +972,8 @@ class isFloatType<ValueType SrcVT> { !if(!eq(SrcVT.Value, f16.Value), 1, !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, - 0))); + !if(!eq(SrcVT.Value, v2f16.Value), 1, + 0)))); } class isIntType<ValueType SrcVT> { @@ -764,6 +984,23 @@ class isIntType<ValueType SrcVT> { 0))); } +class isPackedType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, v2i16.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) + ); +} + +// Float or packed int +class isModifierType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, + !if(!eq(SrcVT.Value, v2i16.Value), 1, + 0))))); +} // Return type of input modifiers operand for specified input operand class getSrcMod <ValueType VT> { @@ -771,6 +1008,7 @@ class getSrcMod <ValueType VT> { !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); + bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), !if(isFP, @@ -782,7 +1020,7 @@ class getSrcMod <ValueType VT> { ); } -// Return type of input modifiers operand specified input operand for SDWA/DPP +// Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, @@ -791,6 +1029,15 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand specified input operand for SDWA +class getSrcModSDWA <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -801,8 +1048,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { // Returns the input arguments for VOP3 instructions for the given SrcVT. 
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, RegisterOperand Src2RC, int NumSrcArgs, - bit HasModifiers, Operand Src0Mod, Operand Src1Mod, - Operand Src2Mod> { + bit HasModifiers, bit HasOMod, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), @@ -821,9 +1068,13 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), // VOP 2 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, omod:$omod) + !if( !eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp)) /* else */, // VOP2 without modifiers (ins Src0RC:$src0, Src1RC:$src1) @@ -831,16 +1082,57 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), // VOP3 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, omod:$omod) + !if (!eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp)) /* else */, // VOP3 without modifiers (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) /* endif */ )))); } +/// XXX - src1 may only allow VGPRs? + +// The modifiers (except clamp) are dummy operands for the benefit of +// printing and parsing. They defer their values to looking at the +// srcN_modifiers for what to print. 
+class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, + bit HasClamp, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !if (!eq(NumSrcArgs, 2), + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)), + // else NumSrcArgs == 3 + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)) + ); +} + class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { @@ -874,37 +1166,67 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, /* endif */))); } -class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, + + +// Ins for SDWA +class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, ValueType DstVT> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) (ins), !if(!eq(NumSrcArgs, 1), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel), + // VOP1 + !if(!eq(HasSDWAOMod, 0), + // VOP1_SDWA without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + // VOP1_SDWA with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel)), !if(!eq(NumSrcArgs, 2), !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with modifiers + // VOPC_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA or VOPC_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel)), + // VOP2_SDWA + !if(!eq(HasSDWAOMod, 0), + // VOP2_SDWA without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel))), (ins)/* endif */))); } // Outs for DPP and SDWA -class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> { +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions - (outs DstRCDPP:$vdst)), + (outs 
DstRCExt:$vdst)), + (outs)); // V_NOP +} + +// Outs for SDWA +class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs DstRCSDWA:$sdst), + (outs DstRCSDWA:$vdst)), (outs)); // V_NOP } @@ -924,7 +1246,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, + bit HasOMod, ValueType DstVT = i32> { string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", @@ -934,7 +1257,26 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = !if(!eq(HasModifiers, 0), getAsm32<HasDst, NumSrcArgs, DstVT>.ret, - dst#", "#src0#src1#src2#"$clamp"#"$omod"); + dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", "")); +} + +// Returns the assembly string for the inputs and outputs of a VOP3P +// instruction. +class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers, + bit HasClamp, ValueType DstVT = i32> { + string dst = " $vdst"; + string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string mods = !if(HasModifiers, "$neg_lo$neg_hi", ""); + string clamp = !if(HasClamp, "$clamp", ""); + + // Each modifier is printed as an array of bits for each operand, so + // all operands are printed as part of src0_modifiers. 
+ string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp; } class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { @@ -953,8 +1295,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; } -class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, - ValueType DstVT = i32> { +class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), " vcc", // use vcc token as dst for VOPC instructioins @@ -982,6 +1323,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, string ret = dst#args#sdwa; } +class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, + ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", // VOPC + "$vdst"), // VOP1/2 + ""); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; + string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); + string args = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + ", "#src0, + ", "#src0#", "#src1 + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + out_mods#" $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC + out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + + // Function that checks if instruction supports DPP and SDWA class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32> { @@ -1018,7 +1388,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1026,16 +1396,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; - field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; - field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; + field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; - field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; - + field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; @@ -1046,7 +1416,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); // TODO: Modifiers logic is somewhat adhoc here, to be refined later - field bit HasModifiers = isFloatType<Src0VT>.ret; + field bit HasModifiers = 
isModifierType<Src0VT>.ret; field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret; field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret; @@ -1060,11 +1430,21 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0); field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); - field bit HasOMod = HasModifiers; field bit HasClamp = HasModifiers; - field bit HasSDWAClamp = HasSrc0; + field bit HasSDWAClamp = EmitDst; + field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; + + field bit IsPacked = isPackedType<Src0VT>.ret; + field bit HasOpSel = IsPacked; + field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret); + field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasSDWA9 = HasExt; + + field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods); field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs)); @@ -1073,25 +1453,34 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; + field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + HasModifiers, HasOMod, Src0Mod, Src1Mod, + Src2Mod>.ret; + field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64, + NumSrcArgs, HasClamp, + Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret; + field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0ModSDWA, Src1ModSDWA, + HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; - field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; + field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret; field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; - field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; + field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; } class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; + let HasSDWA9 = 0; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1101,11 +1490,20 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; -def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; -def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; +def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : 
VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; +def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; + +def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; +def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; + +def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; @@ -1117,6 +1515,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; +def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; @@ -1126,6 +1526,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; @@ -1213,6 +1614,24 @@ def getVOPe32 : InstrMapping { let ValueCols = [["4", "0"]]; } +// Maps ordinary instructions to their SDWA counterparts +def getSDWAOp : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA"]]; +} + +// Maps SDWA instructions to their ordinary counterparts +def getBasicFromSDWAOp : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["SDWA"]; + let ValueCols = [["Default"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; @@ -1245,7 +1664,9 @@ def getMCOpcodeGen : InstrMapping { let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], - [!cast<string>(SIEncodingFamily.VI)]]; + [!cast<string>(SIEncodingFamily.VI)], + [!cast<string>(SIEncodingFamily.SDWA)], + [!cast<string>(SIEncodingFamily.SDWA9)]]; } // Get equivalent SOPK instruction. 
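
The getSDWAOp and getBasicFromSDWAOp InstrMapping defs added above key rows by OpName and columns by AsmVariantName, so TableGen can emit per-opcode lookup tables that translate an instruction between its Default and SDWA asm variants (the same mechanism the extended getMCOpcodeGen uses for the new SDWA/SDWA9 encoding rows). A standalone toy model of what those generated accessors behave like, using invented opcode values rather than the real generated AMDGPU enums:

// Toy model of the generated Default <-> SDWA opcode mapping; the real
// tables are produced by TableGen from the InstrMapping defs above.
#include <cassert>
#include <cstdint>
#include <map>

enum Opcode : uint16_t { V_ADD_F32_e32 = 100, V_ADD_F32_sdwa = 101 };

static const std::map<uint16_t, uint16_t> DefaultToSDWA = {
    {V_ADD_F32_e32, V_ADD_F32_sdwa}};
static const std::map<uint16_t, uint16_t> SDWAToDefault = {
    {V_ADD_F32_sdwa, V_ADD_F32_e32}};

// Mirrors the generated accessors: -1 means "no counterpart in that column".
static int getSDWAOp(uint16_t Opc) {
  auto It = DefaultToSDWA.find(Opc);
  return It == DefaultToSDWA.end() ? -1 : It->second;
}
static int getBasicFromSDWAOp(uint16_t Opc) {
  auto It = SDWAToDefault.find(Opc);
  return It == SDWAToDefault.end() ? -1 : It->second;
}

int main() {
  assert(getSDWAOp(V_ADD_F32_e32) == V_ADD_F32_sdwa);
  assert(getBasicFromSDWAOp(V_ADD_F32_sdwa) == V_ADD_F32_e32);
  assert(getSDWAOp(V_ADD_F32_sdwa) == -1); // not a key in the Default column
  return 0;
}
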
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index 38e31e7..ba69e42 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m < //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// +def ATOMIC_FENCE : SPseudoInstSI< + (outs), (ins i32imm:$ordering, i32imm:$scope), + [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], + "ATOMIC_FENCE $ordering, $scope"> { + let hasSideEffects = 1; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -146,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let mayStore = 1; let isBarrier = 1; let isConvergent = 1; + let FixedSize = 1; + let Size = 0; } // SI pseudo instructions. These are used by the CFG structurizer pass @@ -153,48 +161,51 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), // Dummy terminator instruction to use after control flow instructions // replaced with exec mask operations. -def SI_MASK_BRANCH : PseudoInstSI < +def SI_MASK_BRANCH : VPseudoInstSI < (outs), (ins brtarget:$target)> { let isBranch = 0; let isTerminator = 1; let isBarrier = 0; - let Uses = [EXEC]; let SchedRW = []; let hasNoSchedulingInfo = 1; + let FixedSize = 1; + let Size = 0; } let isTerminator = 1 in { + def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < + (outs), + (ins SReg_64:$vcc, brtarget:$target), + [(brcond i1:$vcc, bb:$target)]> { + let Size = 12; +} + def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> { + [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { let Constraints = ""; let Size = 12; - let mayLoad = 1; - let mayStore = 1; let hasSideEffects = 1; } def SI_ELSE : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { let Constraints = "$src = $dst"; let Size = 12; - let mayStore = 1; - let mayLoad = 1; let hasSideEffects = 1; } def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> { + [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> { let Size = 8; - let isBranch = 1; + let isBranch = 0; let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; } -} // End isBranch = 1, isTerminator = 1 +} // End isTerminator = 1 def SI_END_CF : CFPseudoInstSI < (outs), (ins SReg_64:$saved), @@ -202,9 +213,9 @@ def SI_END_CF : CFPseudoInstSI < let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; - let mayLoad = 1; - let mayStore = 1; let hasSideEffects = 1; + let mayLoad = 1; // FIXME: Should not need memory flags + let mayStore = 1; } def SI_BREAK : CFPseudoInstSI < @@ -244,6 +255,10 @@ def SI_KILL_TERMINATOR : SPseudoInstSI < let isTerminator = 1; } +def SI_ILLEGAL_COPY : SPseudoInstSI < + (outs unknown:$dst), (ins unknown:$src), + [], " ; illegal copy $src to $dst">; + } // End Uses = [EXEC], Defs = [EXEC,VCC] // Branch on undef scc. 
Used to avoid intermediate copy from @@ -259,6 +274,14 @@ def SI_PS_LIVE : PseudoInstSI < let SALU = 1; } +def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), + [(int_amdgcn_unreachable)], + "; divergent unreachable"> { + let Size = 0; + let hasNoSchedulingInfo = 1; + let FixedSize = 1; +} + // Used as an isel pseudo to directly emit initialization with an // s_mov_b32 rather than a copy of another initialized // register. MachineCSE skips copies, and we don't want to have to @@ -270,12 +293,25 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { let isReMaterializable = 1; } -def SI_RETURN : SPseudoInstSI < - (outs), (ins variable_ops), [(AMDGPUreturn)]> { +def SI_INIT_EXEC : SPseudoInstSI < + (outs), (ins i64imm:$src), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; +} + +def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < + (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; +} + +// Return for returning shaders to a shader variant epilog. +def SI_RETURN_TO_EPILOG : SPseudoInstSI < + (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; - let hasSideEffects = 1; let hasNoSchedulingInfo = 1; let DisableWQM = 1; } @@ -383,9 +419,23 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC (as_i64imm $src)) +>; + +def : Pat < + (AMDGPUinit_exec_from_input i32:$input, i32:$shift), + (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) +>; + +def : Pat< + (AMDGPUtrap timm:$trapid), + (S_TRAP $trapid) +>; def : Pat< - (int_amdgcn_else i64:$src, bb:$target), + (AMDGPUelse i64:$src, bb:$target), (SI_ELSE $src, $target, 0) >; @@ -423,24 +473,37 @@ def : Pat < } // End Predicates = [UnsafeFPMath] + +// f16_to_fp patterns def : Pat < - (f32 (fpextend f16:$src)), - (V_CVT_F32_F16_e32 $src) + (f32 (f16_to_fp i32:$src0)), + (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < - (f64 (fpextend f16:$src)), - (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) + (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < - (f16 (fpround f32:$src)), - (V_CVT_F16_F32_e32 $src) + (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < - (f16 (fpround f64:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) + (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Pat < + (f64 (fpextend f16:$src)), + (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) +>; + +// fp_to_fp16 patterns +def : Pat < + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < @@ -469,17 +532,27 @@ def : Pat < multiclass FMADPat <ValueType vt, Instruction inst> { def : Pat < - (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods vt:$src1, i32:$src1_modifiers), - (VOP3NoMods vt:$src2, i32:$src2_modifiers))), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - $src2_modifiers, $src2, $clamp, $omod) + (vt (fmad (VOP3NoMods vt:$src0), + (VOP3NoMods vt:$src1), + (VOP3NoMods vt:$src2))), + (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2, 
DSTCLAMP.NONE, DSTOMOD.NONE) >; } defm : FMADPat <f16, V_MAC_F16_e64>; defm : FMADPat <f32, V_MAC_F32_e64>; +class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat< + (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod), + (VOP3Mods f32:$src1, i32:$src1_mod), + (VOP3Mods f32:$src2, i32:$src2_mod))), + (inst $src0_mod, $src0, $src1_mod, $src1, + $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>; + multiclass SelectPat <ValueType vt, Instruction inst> { def : Pat < (vt (select i1:$src0, vt:$src1, vt:$src2)), @@ -578,6 +651,16 @@ def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, VGPR_32>; def : BitConvert <i32, f32, SReg_32>; def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <v2i16, i32, SReg_32>; +def : BitConvert <i32, v2i16, SReg_32>; +def : BitConvert <v2f16, i32, SReg_32>; +def : BitConvert <i32, v2f16, SReg_32>; +def : BitConvert <v2i16, v2f16, SReg_32>; +def : BitConvert <v2f16, v2i16, SReg_32>; +def : BitConvert <v2f16, f32, SReg_32>; +def : BitConvert <f32, v2f16, SReg_32>; +def : BitConvert <v2i16, f32, SReg_32>; +def : BitConvert <f32, v2i16, SReg_32>; // 64-bit bitcast def : BitConvert <i64, f64, VReg_64>; @@ -619,12 +702,19 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ -def : Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) + +// If denormals are not enabled, it only impacts the compare of the +// inputs. The output result is not flushed. +class ClampPat<Instruction inst, ValueType vt> : Pat < + (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), + (inst i32:$src0_modifiers, vt:$src0, + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) >; +def : ClampPat<V_MAX_F32_e64, f32>; +def : ClampPat<V_MAX_F64, f64>; +def : ClampPat<V_MAX_F16_e64, f16>; + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ @@ -678,6 +768,37 @@ def : Pat < >; def : Pat < + (fcopysign f16:$src0, f16:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) +>; + +def : Pat < + (fcopysign f32:$src0, f16:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0, + (V_LSHLREV_B32_e64 (i32 16), $src1)) +>; + +def : Pat < + (fcopysign f64:$src0, f16:$src1), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), + (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) +>; + +def : Pat < + (fcopysign f16:$src0, f32:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, + (V_LSHRREV_B32_e64 (i32 16), $src1)) +>; + +def : Pat < + (fcopysign f16:$src0, f64:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, + (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) +>; + +def : Pat < (fneg f16:$src), (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) >; @@ -692,6 +813,25 @@ def : Pat < (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; +def : Pat < + (fneg v2f16:$src), + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) +>; + +def : Pat < + (fabs v2f16:$src), + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) +>; + +// This is really (fneg (fabs v2f16:$src)) +// +// fabs is not reported as free because there is modifier for it in +// VOP3P instructions, so it is turned 
into the bit op. +def : Pat < + (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), + (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit +>; + /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ @@ -759,27 +899,6 @@ def : Pat < def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < - (int_AMDGPU_cube v4f32:$src), - (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub3) ->; - -def : Pat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) >; @@ -810,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; +def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + +def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ @@ -933,7 +1060,7 @@ def : Pat < class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; @@ -985,6 +1112,11 @@ def : Pat < //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// +def : Pat < + (i32 (AMDGPUfp16_zext f16:$src)), + (COPY $src) +>; + def : Pat < (i32 (trunc i64:$a)), @@ -1028,24 +1160,72 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; +defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; -def : BFEPattern <V_BFE_U32, S_MOV_B32>; +def : Pat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) +>; def : Pat< - (fcanonicalize f16:$src), - (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0) + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, 
$src, 0, 0) >; def : Pat< - (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0) >; def : Pat< - (fcanonicalize f64:$src), - (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0) + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +>; + + +// Allow integer inputs +class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat< + (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)), + (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en) >; +def : ExpPattern<AMDGPUexport, i32, EXP>; +def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; + +def : Pat < + (v2i16 (build_vector i16:$src0, i16:$src1)), + (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + +// With multiple uses of the shift, this will duplicate the shift and +// increase register pressure. +def : Pat < + (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), + (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) +>; + +def : Pat < + (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), + (v2i16 (S_PACK_HH_B32_B16 $src0, $src1)) +>; + +// TODO: Should source modifiers be matched to v_pack_b32_f16? +def : Pat < + (v2f16 (build_vector f16:$src0, f16:$src1)), + (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + +// def : Pat < +// (v2f16 (scalar_to_vector f16:$src0)), +// (COPY $src0) +// >; + +// def : Pat < +// (v2i16 (scalar_to_vector i16:$src0)), +// (COPY $src0) +// >; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1083,11 +1263,39 @@ def : Pat < // Miscellaneous Optimization Patterns //============================================================================// +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. 
+def : Pat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (S_SUB_I32 $src0, NegSubInlineConst32:$src1) +>; + def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; +// This matches 16 permutations of +// max(min(x, y), min(max(x, y), z)) +class FPMed3Pat<ValueType vt, + Instruction med3Inst> : Pat< + (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), + (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : FPMed3Pat<f32, V_MED3_F32>; + +let Predicates = [isGFX9] in { +def : FPMed3Pat<f16, V_MED3_F16>; +def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; +def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; +} // End Predicates = [isGFX9] + //============================================================================// // Assembler aliases //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td index 5da3754..7b7cf16 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -14,23 +14,7 @@ let TargetPrefix = "SI", isTarget = 1 in { - def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - - def int_SI_export : Intrinsic <[], - [llvm_i32_ty, // en - llvm_i32_ty, // vm (FIXME: should be i1) - llvm_i32_ty, // done (FIXME: should be i1) - llvm_i32_ty, // tgt - llvm_i32_ty, // compr (FIXME: should be i1) - llvm_float_ty, // src0 - llvm_float_ty, // src1 - llvm_float_ty, // src2 - llvm_float_ty], // src3 - [] - >; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed def int_SI_tbuffer_store : Intrinsic < @@ -64,146 +48,4 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty], // tfe(imm) [IntrReadMem, IntrArgMemOnly]>; - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>; - - // Fully-flexible SAMPLE instruction. - class SampleRaw : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_v4i32_ty, // sampler(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Image instruction without a sampler. 
- class Image : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Basic sample - def int_SI_image_sample : SampleRaw; - def int_SI_image_sample_cl : SampleRaw; - def int_SI_image_sample_d : SampleRaw; - def int_SI_image_sample_d_cl : SampleRaw; - def int_SI_image_sample_l : SampleRaw; - def int_SI_image_sample_b : SampleRaw; - def int_SI_image_sample_b_cl : SampleRaw; - def int_SI_image_sample_lz : SampleRaw; - def int_SI_image_sample_cd : SampleRaw; - def int_SI_image_sample_cd_cl : SampleRaw; - - // Sample with comparison - def int_SI_image_sample_c : SampleRaw; - def int_SI_image_sample_c_cl : SampleRaw; - def int_SI_image_sample_c_d : SampleRaw; - def int_SI_image_sample_c_d_cl : SampleRaw; - def int_SI_image_sample_c_l : SampleRaw; - def int_SI_image_sample_c_b : SampleRaw; - def int_SI_image_sample_c_b_cl : SampleRaw; - def int_SI_image_sample_c_lz : SampleRaw; - def int_SI_image_sample_c_cd : SampleRaw; - def int_SI_image_sample_c_cd_cl : SampleRaw; - - // Sample with offsets - def int_SI_image_sample_o : SampleRaw; - def int_SI_image_sample_cl_o : SampleRaw; - def int_SI_image_sample_d_o : SampleRaw; - def int_SI_image_sample_d_cl_o : SampleRaw; - def int_SI_image_sample_l_o : SampleRaw; - def int_SI_image_sample_b_o : SampleRaw; - def int_SI_image_sample_b_cl_o : SampleRaw; - def int_SI_image_sample_lz_o : SampleRaw; - def int_SI_image_sample_cd_o : SampleRaw; - def int_SI_image_sample_cd_cl_o : SampleRaw; - - // Sample with comparison and offsets - def int_SI_image_sample_c_o : SampleRaw; - def int_SI_image_sample_c_cl_o : SampleRaw; - def int_SI_image_sample_c_d_o : SampleRaw; - def int_SI_image_sample_c_d_cl_o : SampleRaw; - def int_SI_image_sample_c_l_o : SampleRaw; - def int_SI_image_sample_c_b_o : SampleRaw; - def int_SI_image_sample_c_b_cl_o : SampleRaw; - def int_SI_image_sample_c_lz_o : SampleRaw; - def int_SI_image_sample_c_cd_o : SampleRaw; - def int_SI_image_sample_c_cd_cl_o : SampleRaw; - - // Basic gather4 - def int_SI_gather4 : SampleRaw; - def int_SI_gather4_cl : SampleRaw; - def int_SI_gather4_l : SampleRaw; - def int_SI_gather4_b : SampleRaw; - def int_SI_gather4_b_cl : SampleRaw; - def int_SI_gather4_lz : SampleRaw; - - // Gather4 with comparison - def int_SI_gather4_c : SampleRaw; - def int_SI_gather4_c_cl : SampleRaw; - def int_SI_gather4_c_l : SampleRaw; - def int_SI_gather4_c_b : SampleRaw; - def int_SI_gather4_c_b_cl : SampleRaw; - def int_SI_gather4_c_lz : SampleRaw; - - // Gather4 with offsets - def int_SI_gather4_o : SampleRaw; - def int_SI_gather4_cl_o : SampleRaw; - def int_SI_gather4_l_o : SampleRaw; - def int_SI_gather4_b_o : SampleRaw; - def int_SI_gather4_b_cl_o : SampleRaw; - def int_SI_gather4_lz_o : SampleRaw; - - // Gather4 with comparison and offsets - def int_SI_gather4_c_o : SampleRaw; - def int_SI_gather4_c_cl_o : SampleRaw; - def int_SI_gather4_c_l_o : SampleRaw; - def int_SI_gather4_c_b_o : SampleRaw; - def int_SI_gather4_c_b_cl_o : SampleRaw; - def int_SI_gather4_c_lz_o : SampleRaw; - - def int_SI_getlod : SampleRaw; - - // Image instrinsics. 
- def int_SI_image_load : Image; - def int_SI_image_load_mip : Image; - def int_SI_getresinfo : Image; - - /* Interpolation Intrinsics */ - - def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; } // End TargetPrefix = "SI", isTarget = 1 - -let TargetPrefix = "amdgcn", isTarget = 1 in { - // Emit 2.5 ulp, no denormal division. Should only be inserted by - // pass based on !fpmath metadata. - def int_amdgcn_fdiv_fast : Intrinsic< - [llvm_float_ty], [llvm_float_ty], [IntrNoMem] - >; - - /* Control flow Intrinsics */ - - def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 99fe96c..c6ad61a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -39,15 +39,27 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <iterator> +#include <utility> using namespace llvm; @@ -56,41 +68,36 @@ using namespace llvm; namespace { class SILoadStoreOptimizer : public MachineFunctionPass { + + typedef struct { + MachineBasicBlock::iterator I; + MachineBasicBlock::iterator Paired; + unsigned EltSize; + unsigned Offset0; + unsigned Offset1; + unsigned BaseOff; + bool UseST64; + SmallVector<MachineInstr*, 8> InstsToMove; + } CombineInfo; + private: - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - AliasAnalysis *AA; - - static bool offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned EltSize); - - MachineBasicBlock::iterator findMatchingDSInst( - MachineBasicBlock::iterator I, - unsigned EltSize, - SmallVectorImpl<MachineInstr*> &InstsToMove); - - MachineBasicBlock::iterator mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove); - - MachineBasicBlock::iterator 
mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove); + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + AliasAnalysis *AA = nullptr; + + static bool offsetsCanBeCombined(CombineInfo &CI); + + bool findMatchingDSInst(CombineInfo &CI); + + MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); + + MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); public: static char ID; - SILoadStoreOptimizer() - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - AA(nullptr) {} - - SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + SILoadStoreOptimizer() : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } @@ -108,7 +115,7 @@ public: } }; -} // End anonymous namespace. +} // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) @@ -120,8 +127,8 @@ char SILoadStoreOptimizer::ID = 0; char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; -FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { - return new SILoadStoreOptimizer(TM); +FunctionPass *llvm::createSILoadStoreOptimizerPass() { + return new SILoadStoreOptimizer(); } static void moveInstsAfter(MachineBasicBlock::iterator I, @@ -141,11 +148,10 @@ static void addDefsToList(const MachineInstr &MI, } } -static bool memAccessesCanBeReordered( - MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - const SIInstrInfo *TII, - llvm::AliasAnalysis * AA) { +static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, + MachineBasicBlock::iterator B, + const SIInstrInfo *TII, + AliasAnalysis * AA) { return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) || // RAW or WAR - cannot reorder // WAW - cannot reorder @@ -179,7 +185,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp, ArrayRef<MachineInstr*> InstsToMove, const SIInstrInfo *TII, AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { @@ -191,47 +196,68 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp, return true; } -bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned Size) { +bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? - if (Offset0 == Offset1) + if (CI.Offset0 == CI.Offset1) return false; // This won't be valid if the offset isn't aligned. - if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) return false; - unsigned EltOffset0 = Offset0 / Size; - unsigned EltOffset1 = Offset1 / Size; + unsigned EltOffset0 = CI.Offset0 / CI.EltSize; + unsigned EltOffset1 = CI.Offset1 / CI.EltSize; + CI.UseST64 = false; + CI.BaseOff = 0; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. + if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && + isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { + CI.Offset0 = EltOffset0 / 64; + CI.Offset1 = EltOffset1 / 64; + CI.UseST64 = true; + return true; + } // Check if the new offsets fit in the reduced 8-bit range. 
- if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { + CI.Offset0 = EltOffset0; + CI.Offset1 = EltOffset1; return true; + } - // If the offset in elements doesn't fit in 8-bits, we might be able to use - // the stride 64 versions. - if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) - return false; + // Try to shift base address to decrease offsets. + unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0); + CI.BaseOff = std::min(CI.Offset0, CI.Offset1); - return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); + if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { + CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; + CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + CI.UseST64 = true; + return true; + } + + if (isUInt<8>(OffsetDiff)) { + CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize; + CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize; + return true; + } + + return false; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize, - SmallVectorImpl<MachineInstr*> &InstsToMove) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; +bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { + MachineBasicBlock::iterator E = CI.I->getParent()->end(); + MachineBasicBlock::iterator MBBI = CI.I; ++MBBI; SmallVector<const MachineOperand *, 8> DefsToMove; - addDefsToList(*I, DefsToMove); + addDefsToList(*CI.I, DefsToMove); for ( ; MBBI != E; ++MBBI) { - - if (MBBI->getOpcode() != I->getOpcode()) { + if (MBBI->getOpcode() != CI.I->getOpcode()) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: @@ -242,14 +268,14 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, if (MBBI->hasUnmodeledSideEffects()) // We can't re-order this instruction with respect to other memory // opeations, so we fail both conditions mentioned above. - return E; + return false; if (MBBI->mayLoadOrStore() && - !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) { + !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); + CI.InstsToMove.push_back(&*MBBI); addDefsToList(*MBBI, DefsToMove); continue; } @@ -257,13 +283,13 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - addToListsIfDependent(*MBBI, DefsToMove, InstsToMove); + addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); continue; } // Don't merge volatiles. if (MBBI->hasOrderedMemoryRef()) - return E; + return false; // Handle a case like // DS_WRITE_B32 addr, v, idx0 @@ -271,77 +297,67 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, // DS_WRITE_B32 addr, f(w), idx1 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. 
- if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove)) + if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) continue; - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), + AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx); const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); // Check same base pointer. Be careful of subregisters, which can occur with // vectors of pointers. if (AddrReg0.getReg() == AddrReg1.getReg() && AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Paired = MBBI; // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize) && - canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA)) - return MBBI; + if (offsetsCanBeCombined(CI)) + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + return true; } // We've found a load/store that we couldn't merge for some reason. // We could potentially keep looking, but we'd need to make sure that // it was safe to move I and also all the instruction in InstsToMove // down past this instruction. - if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users - ) + // check if we can move I across MBBI and if we can move all I's users + if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) break; } - return E; + return false; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird // cases, like vectors of pointers. - const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - - const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst); - const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst); - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - } + const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst); + + unsigned NewOffset0 = CI.Offset0; + unsigned NewOffset1 = CI.Offset1; + unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32 + : AMDGPU::DS_READ2_B64; + + if (CI.UseST64) + Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 + : AMDGPU::DS_READ2ST64_B64; - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -356,72 +372,70 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( const MCInstrDesc &Read2Desc = TII->get(Opc); const TargetRegisterClass *SuperRC - = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = I->getDebugLoc(); - MachineInstrBuilder Read2 - = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg) - .addOperand(*AddrReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); + DebugLoc DL = CI.I->getDebugLoc(); + + unsigned BaseReg = AddrReg->getReg(); + unsigned BaseRegFlags = 0; + if (CI.BaseOff) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BaseRegFlags = RegState::Kill; + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) + .addImm(CI.BaseOff) + .addReg(AddrReg->getReg()); + } + + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + (void)Read2; const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired, DL, CopyDesc) - .addOperand(*Dest0) // Copy to same destination including flags and sub reg. - .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc) - .addOperand(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. 
+ .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, InstsToMove); + moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(I); - I->eraseFromParent(); - Paired->eraseFromParent(); + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); return Next; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); const MachineOperand *Data1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + unsigned NewOffset0 = CI.Offset0; + unsigned NewOffset1 = CI.Offset1; + unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32 + : AMDGPU::DS_WRITE2_B64; - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; - } + if (CI.UseST64) + Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
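
The reworked offsetsCanBeCombined above now has three ways to succeed: the plain 8-bit ds_read2/ds_write2 offsets, the stride-64 (_st64) forms, and, new in this change, rewriting both offsets against a shifted base (CI.BaseOff) that mergeRead2Pair materializes with a V_ADD_I32_e32. A standalone model of that decision, with invented byte offsets, just to show the arithmetic:

// Standalone model of SILoadStoreOptimizer::offsetsCanBeCombined (not the
// real pass, just the offset arithmetic). Offsets are in bytes on input and
// in encoded units on output.
#include <algorithm>
#include <cstdio>
#include <cstdlib>

struct Combine {
  unsigned Offset0, Offset1, EltSize, BaseOff;
  bool UseST64;
};

static bool isUInt8(unsigned V) { return V <= 0xff; }

static bool offsetsCanBeCombined(Combine &CI) {
  if (CI.Offset0 == CI.Offset1)
    return false;
  if (CI.Offset0 % CI.EltSize != 0 || CI.Offset1 % CI.EltSize != 0)
    return false;

  unsigned Elt0 = CI.Offset0 / CI.EltSize;
  unsigned Elt1 = CI.Offset1 / CI.EltSize;

  // Stride-64 form: both element offsets are multiples of 64 and still fit
  // in 8 bits after dividing by 64.
  if (Elt0 % 64 == 0 && Elt1 % 64 == 0 &&
      isUInt8(Elt0 / 64) && isUInt8(Elt1 / 64)) {
    CI.Offset0 = Elt0 / 64;
    CI.Offset1 = Elt1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Plain form: both element offsets already fit in the 8-bit fields.
  if (isUInt8(Elt0) && isUInt8(Elt1)) {
    CI.Offset0 = Elt0;
    CI.Offset1 = Elt1;
    return true;
  }

  // Otherwise shift the base address (in bytes) so only the difference has
  // to fit; the caller emits a v_add for BaseOff.
  unsigned Diff = std::abs((int)Elt1 - (int)Elt0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
  if (Diff % 64 == 0 && isUInt8(Diff / 64)) {
    CI.Offset0 = (Elt0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (Elt1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }
  if (isUInt8(Diff)) {
    CI.Offset0 = Elt0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = Elt1 - CI.BaseOff / CI.EltSize;
    return true;
  }
  return false;
}

int main() {
  // Two 4-byte loads at byte offsets 4096 and 4100: too far apart from zero
  // for the 8-bit fields and not 64-element aligned, but only one element
  // apart, so the base is bumped by 4096 and the pair becomes 0 and 1.
  Combine CI{4096, 4100, 4, 0, false};
  if (offsetsCanBeCombined(CI))
    std::printf("base+%u, offsets %u,%u, st64=%d\n",
                CI.BaseOff, CI.Offset0, CI.Offset1, CI.UseST64);
  return 0;
}
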
@@ -434,24 +448,33 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = CI.I->getDebugLoc(); + + unsigned BaseReg = Addr->getReg(); + unsigned BaseRegFlags = 0; + if (CI.BaseOff) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BaseRegFlags = RegState::Kill; + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) + .addImm(CI.BaseOff) + .addReg(Addr->getReg()); + } - MachineInstrBuilder Write2 - = BuildMI(*MBB, Paired, DL, Write2Desc) - .addOperand(*Addr) // addr - .addOperand(*Data0) // data0 - .addOperand(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - moveInstsAfter(Write2, InstsToMove); + moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(I); - I->eraseFromParent(); - Paired->eraseFromParent(); + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); return Next; @@ -472,27 +495,24 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } - SmallVector<MachineInstr*, 8> InstsToMove; + CombineInfo CI; + CI.I = I; unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { - unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, - InstsToMove); - if (Match != E) { + CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + if (findMatchingDSInst(CI)) { Modified = true; - I = mergeRead2Pair(I, Match, Size, InstsToMove); + I = mergeRead2Pair(CI); } else { ++I; } continue; } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { - unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, - InstsToMove); - if (Match != E) { + CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 
8 : 4; + if (findMatchingDSInst(CI)) { Modified = true; - I = mergeWrite2Pair(I, Match, Size, InstsToMove); + I = mergeWrite2Pair(CI); } else { ++I; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 7ed18f2..5f1c7f1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -51,13 +51,23 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <iterator> using namespace llvm; @@ -67,10 +77,10 @@ namespace { class SILowerControlFlow : public MachineFunctionPass { private: - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - LiveIntervals *LIS; - MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + LiveIntervals *LIS = nullptr; + MachineRegisterInfo *MRI = nullptr; void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); @@ -88,12 +98,7 @@ private: public: static char ID; - SILowerControlFlow() : - MachineFunctionPass(ID), - TRI(nullptr), - TII(nullptr), - LIS(nullptr), - MRI(nullptr) {} + SILowerControlFlow() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -113,7 +118,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char SILowerControlFlow::ID = 0; @@ -175,9 +180,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Insert a pseudo terminator to help keep the verifier happy. This will also // be used later when inserting skips. - MachineInstr *NewBr = - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) - .addOperand(MI.getOperand(2)); + MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .add(MI.getOperand(2)); if (!LIS) { MI.eraseFromParent(); @@ -220,8 +224,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // tied. In order to correctly tie the registers, split this into a copy of // the src like it does. unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) - .addOperand(MI.getOperand(1)); // Saved EXEC + MachineInstr *CopyExec = + BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) + .add(MI.getOperand(1)); // Saved EXEC // This must be inserted before phis and any spill code inserted before the // else. 
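
Much of the mechanical churn in these load/store and control-flow hunks is the move from MachineInstrBuilder::addOperand to the newer ::add spelling; the point, as the comment kept in mergeWrite2Pair says, is that copying the whole MachineOperand preserves its subregister index and register flags, which rebuilding the operand with .addReg(MO.getReg()) would drop. A fragment sketching that idiom, assuming the LLVM-5-era builder API (it needs the LLVM CodeGen headers to compile, and forwardSrcOperand is only an illustrative name, not part of this patch):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

// Illustrative helper: re-emit MI's operand 1 onto a new instruction at InsPt.
static MachineInstr *forwardSrcOperand(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator InsPt,
                                       MachineInstr &MI,
                                       const MCInstrDesc &Desc,
                                       unsigned DstReg) {
  const MachineOperand &Src = MI.getOperand(1);
  // .addReg(Src.getReg()) would rebuild the operand and lose its subregister
  // index and kill/undef/implicit flags; .add(Src) copies it wholesale.
  return BuildMI(MBB, InsPt, MI.getDebugLoc(), Desc, DstReg).add(Src);
}
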
@@ -262,6 +267,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->RemoveMachineInstrFromMaps(MI); MI.eraseFromParent(); + LIS->InsertMachineInstrInMaps(*CopyExec); LIS->InsertMachineInstrInMaps(*OrSaveExec); LIS->InsertMachineInstrInMaps(*Xor); @@ -283,10 +289,9 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - MachineInstr *Or = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(1)); + MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(1)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *Or); @@ -306,13 +311,13 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(0)); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); MachineInstr *Branch = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .add(MI.getOperand(1)); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *AndN2); @@ -328,9 +333,9 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock::iterator InsPt = MBB.begin(); MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(0)); + BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index be2e14f..ba616ad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -21,8 +21,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" @@ -114,18 +114,18 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { assert(Val == 0 || Val == -1); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .addOperand(Dst) - .addImm(Val); + .add(Dst) + .addImm(Val); MI.eraseFromParent(); continue; } } BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(Dst) - .addImm(0) - .addImm(-1) - .addOperand(Src); + .add(Dst) + .addImm(0) + .addImm(-1) + .add(Src); MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { @@ -140,14 +140,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MRI.getRegClass(DefInst->getOperand(3).getReg()), &AMDGPU::SGPR_64RegClass)) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) - .addOperand(Dst) - .addReg(AMDGPU::EXEC) - .addOperand(DefInst->getOperand(3)); + .add(Dst) + .addReg(AMDGPU::EXEC) + .add(DefInst->getOperand(3)); } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) - .addOperand(Dst) - .addOperand(Src) - .addImm(0); + .add(Dst) + .add(Src) + .addImm(0); } MI.eraseFromParent(); } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index ecd46b9..a7c8166 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -20,17 +20,13 @@ using namespace llvm; -static cl::opt<bool> EnableSpillSGPRToVGPR( - "amdgpu-spill-sgpr-to-vgpr", - cl::desc("Enable spilling VGPRs to SGPRs"), - cl::ReallyHidden, - cl::init(true)); - SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -46,14 +42,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), + PSInputEnable(0), ReturnsVoid(true), FlatWorkGroupSizes(0, 0), WavesPerEU(0, 0), DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), LDSWaveSpillSize(0), - PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), @@ -78,44 +77,83 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); + FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); + WavesPerEU = ST.getWavesPerEU(*F); - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + if (!isEntryFunction()) { + // Non-entry functions have no special inputs for now, other registers + // required for scratch access. + ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + ScratchWaveOffsetReg = AMDGPU::SGPR4; + FrameOffsetReg = AMDGPU::SGPR5; + StackPtrOffsetReg = AMDGPU::SGPR32; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; + } - if (!AMDGPU::isShader(F->getCallingConv())) { - KernargSegmentPtr = true; + CallingConv::ID CC = F->getCallingConv(); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); } - if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue()) + if (ST.debuggerEmitPrologue()) { + // Enable everything. 
+ WorkGroupIDX = true; WorkGroupIDY = true; - - if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue()) WorkGroupIDZ = true; - - if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue()) + WorkItemIDX = true; WorkItemIDY = true; - - if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue()) WorkItemIDZ = true; + } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + } + + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo.hasStackObjects(); - if (HasStackObjects || MaySpill) - PrivateSegmentWaveByteOffset = true; + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; + + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; - if (ST.isAmdCodeObjectV2(MF)) { + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } + } + + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -129,18 +167,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS && - ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; - FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); - WavesPerEU = ST.getWavesPerEU(*F); + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. 
+ if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -186,52 +224,67 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( - MachineFunction *MF, - unsigned FrameIndex, - unsigned SubIdx) { - if (!EnableSpillSGPRToVGPR) - return SpilledReg(); - - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo.getObjectOffset(FrameIndex); - Offset += SubIdx * 4; +/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, + int FI) { + std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI]; - unsigned LaneVGPRIdx = Offset / (64 * 4); - unsigned Lane = (Offset / 4) % 64; + // This has already been allocated. + if (!SpillLanes.empty()) + return true; - struct SpilledReg Spill; - Spill.Lane = Lane; - - if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, - *MF); - - if (LaneVGPR == AMDGPU::NoRegister) - // We have no VGPRs left for spilling SGPRs. - return Spill; - - LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); - BI != BE; ++BI) { - BI->addLiveIn(LaneVGPR); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned WaveSize = ST.getWavefrontSize(); + + unsigned Size = FrameInfo.getObjectSize(FI); + assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size"); + assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); + + int NumLanes = Size / 4; + + // Make sure to handle the case where a wide SGPR spill may span between two + // VGPRs. + for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { + unsigned LaneVGPR; + unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); + + if (VGPRIndex == 0) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we won't + // partially spill the SGPR to VGPRs. + SGPRToVGPRSpills.erase(FI); + NumVGPRSpillLanes -= I; + return false; + } + + SpillVGPRs.push_back(LaneVGPR); + + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. 
+ for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); + } else { + LaneVGPR = SpillVGPRs.back(); } + + SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); } - Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - return Spill; + return true; +} + +void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { + for (auto &R : SGPRToVGPRSpills) + MFI.RemoveStackObject(R.first); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 6fc8d18..4c7f38a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -15,14 +15,18 @@ #define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" #include <array> +#include <cassert> #include <map> +#include <utility> namespace llvm { -class MachineRegisterInfo; - class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: explicit AMDGPUImagePseudoSourceValue() : @@ -84,8 +88,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // This is the current function's incremented size from the kernel's scratch + // wave offset register. For an entry function, this is exactly the same as + // the ScratchWaveOffsetReg. + unsigned FrameOffsetReg; + + // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. + unsigned StackPtrOffsetReg; + // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. @@ -107,8 +119,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. unsigned PSInputAddr; + unsigned PSInputEnable; + bool ReturnsVoid; // A pair of default/requested minimum/maximum flat work group sizes. 
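The rewritten allocateSGPRSpillToVGPR() above replaces the old per-offset getSpilledReg() map with a running lane counter keyed by frame index. A minimal standalone sketch of that lane bookkeeping, not part of the patch: NextFreeVGPR stands in for TRI->findUnusedRegister(), and every name in the helper is hypothetical.

    #include <utility>
    #include <vector>

    // Illustrative only: mirrors the NumVGPRSpillLanes / WaveSize arithmetic of
    // allocateSGPRSpillToVGPR(), without the MachineFunction plumbing.
    static std::vector<std::pair<unsigned, unsigned>>   // (spill VGPR, lane) per dword
    sketchSpillLanes(unsigned SpillSizeInBytes, unsigned WaveSize,
                     unsigned &NumVGPRSpillLanes, std::vector<unsigned> &SpillVGPRs,
                     unsigned &NextFreeVGPR) {
      std::vector<std::pair<unsigned, unsigned>> Lanes;
      unsigned NumLanes = SpillSizeInBytes / 4;         // one lane per 32-bit SGPR
      for (unsigned I = 0; I != NumLanes; ++I, ++NumVGPRSpillLanes) {
        unsigned Lane = NumVGPRSpillLanes % WaveSize;   // lane within the current VGPR
        if (Lane == 0)                                  // current spill VGPR is full:
          SpillVGPRs.push_back(NextFreeVGPR++);         // "scavenge" a fresh one
        Lanes.emplace_back(SpillVGPRs.back(), Lane);
      }
      return Lanes;
    }

With WaveSize = 64, a 16-byte spill that starts with the running counter at 62 lands in lanes 62 and 63 of the current spill VGPR and lanes 0 and 1 of a newly scavenged one, which is exactly the "wide SGPR spill may span between two VGPRs" case called out in the comment above.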
@@ -127,16 +146,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { AMDGPUBufferPseudoSourceValue BufferPSV; AMDGPUImagePseudoSourceValue ImagePSV; -public: - // FIXME: Make private +private: unsigned LDSWaveSpillSize; - unsigned PSInputEna; - std::map<unsigned, unsigned> LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; unsigned NumSystemSGPRs; -private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; bool HasNonSpillStackObjects; @@ -169,7 +184,7 @@ private: // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -182,19 +197,39 @@ private: public: struct SpilledReg { - unsigned VGPR; - int Lane; + unsigned VGPR = AMDGPU::NoRegister; + int Lane = -1; + + SpilledReg() = default; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { } + bool hasLane() { return Lane != -1;} bool hasReg() { return VGPR != AMDGPU::NoRegister;} }; - // SIMachineFunctionInfo definition +private: + // SGPR->VGPR spilling support. + typedef std::pair<unsigned, unsigned> SpillRegMask; + + // Track VGPR + wave index for each subregister of the SGPR spilled to + // frameindex key. + DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; + unsigned NumVGPRSpillLanes = 0; + SmallVector<unsigned, 2> SpillVGPRs; + +public: SIMachineFunctionInfo(const MachineFunction &MF); - SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, - unsigned SubIdx); + + ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { + auto I = SGPRToVGPRSpills.find(FrameIndex); + return (I == SGPRToVGPRSpills.end()) ? + ArrayRef<SpilledReg>() : makeArrayRef(I->second); + } + + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } @@ -206,7 +241,7 @@ public: unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -311,8 +346,8 @@ public: return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -342,17 +377,35 @@ public: return ScratchWaveOffsetReg; } + unsigned getFrameOffsetReg() const { + return FrameOffsetReg; + } + + void setStackPtrOffsetReg(unsigned Reg) { + StackPtrOffsetReg = Reg; + } + + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. 
+ unsigned getStackPtrOffsetReg() const { + return StackPtrOffsetReg; + } + void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != AMDGPU::NoRegister && "Should never be unset"); ScratchWaveOffsetReg = Reg; + if (isEntryFunction()) + FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { @@ -399,6 +452,10 @@ public: return PSInputAddr; } + unsigned getPSInputEnable() const { + return PSInputEnable; + } + bool isPSInputAllocated(unsigned Index) const { return PSInputAddr & (1 << Index); } @@ -407,6 +464,10 @@ public: PSInputAddr |= 1 << Index; } + void markPSInputEnabled(unsigned Index) { + PSInputEnable |= 1 << Index; + } + bool returnsVoid() const { return ReturnsVoid; } @@ -503,6 +564,10 @@ public: llvm_unreachable("unexpected dimension"); } + unsigned getLDSWaveSpillSize() const { + return LDSWaveSpillSize; + } + const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { return &BufferPSV; } @@ -512,6 +577,6 @@ public: } }; -} // End namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index da86bbf..34886c4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -12,9 +12,9 @@ // //===----------------------------------------------------------------------===// +#include "SIMachineScheduler.h" #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIMachineScheduler.h" #include "SIRegisterInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -38,7 +38,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" // This scheduler implements a different scheduling algorithm than // GenericScheduler. @@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { Preds.push_back(Pred); assert(none_of(Succs, - [=](SIScheduleBlock *S) { return PredID == S->getID(); }) && + [=](std::pair<SIScheduleBlock*, + SIScheduleBlockLinkKind> S) { + return PredID == S.first->getID(); + }) && "Loop in the Block Graph!"); } -void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, + SIScheduleBlockLinkKind Kind) { unsigned SuccID = Succ->getID(); // Check if not already predecessor. 
- for (SIScheduleBlock* S : Succs) { - if (SuccID == S->getID()) + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { + if (SuccID == S.first->getID()) { + if (S.second == SIScheduleBlockLinkKind::NoData && + Kind == SIScheduleBlockLinkKind::Data) + S.second = Kind; return; + } } if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; - Succs.push_back(Succ); + Succs.push_back(std::make_pair(Succ, Kind)); + assert(none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"); @@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) { } dbgs() << "\nSuccessors:\n"; - for (SIScheduleBlock* S : Succs) { - S->printDebug(false); + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { + if (S.second == SIScheduleBlockLinkKind::Data) + dbgs() << "(Data Dep) "; + S.first->printDebug(false); } if (Scheduled) { @@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() { } } +static bool +hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { + for (const auto &PredDep : SU.Preds) { + if (PredDep.getSUnit() == &FromSU && + PredDep.getKind() == llvm::SDep::Data) + return true; + } + return false; +} + void SIScheduleBlockCreator::colorHighLatenciesGroups() { unsigned DAGSize = DAG->SUnits.size(); unsigned NumHighLatencies = 0; unsigned GroupSize; - unsigned Color = NextReservedID; + int Color = NextReservedID; unsigned Count = 0; std::set<unsigned> FormingGroup; @@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { else GroupSize = 4; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - if (DAG->IsHighLatencySU[SU->NodeNum]) { + for (unsigned SUNum : DAG->TopDownIndex2SU) { + const SUnit &SU = DAG->SUnits[SUNum]; + if (DAG->IsHighLatencySU[SU.NodeNum]) { unsigned CompatibleGroup = true; - unsigned ProposedColor = Color; + int ProposedColor = Color; + std::vector<int> AdditionalElements; + + // We don't want to put in the same block + // two high latency instructions that depend + // on each other. + // One way would be to check canAddEdge + // in both directions, but that currently is not + // enough because there the high latency order is + // enforced (via links). + // Instead, look at the dependencies between the + // high latency instructions and deduce if it is + // a data dependency or not. for (unsigned j : FormingGroup) { - // TODO: Currently CompatibleGroup will always be false, - // because the graph enforces the load order. This - // can be fixed, but as keeping the load order is often - // good for performance that causes a performance hit (both - // the default scheduler and this scheduler). - // When this scheduler determines a good load order, - // this can be fixed. - if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || - !DAG->canAddEdge(&DAG->SUnits[j], SU)) + bool HasSubGraph; + std::vector<int> SubGraph; + // By construction (topological order), if SU and + // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // in the parent graph of SU. +#ifndef NDEBUG + SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], + HasSubGraph); + assert(!HasSubGraph); +#endif + SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, + HasSubGraph); + if (!HasSubGraph) + continue; // No dependencies between each other + else if (SubGraph.size() > 5) { + // Too many elements would be required to be added to the block. 
CompatibleGroup = false; + break; + } + else { + // Check the type of dependency + for (unsigned k : SubGraph) { + // If in the path to join the two instructions, + // there is another high latency instruction, + // or instructions colored for another block + // abort the merge. + if (DAG->IsHighLatencySU[k] || + (CurrentColoring[k] != ProposedColor && + CurrentColoring[k] != 0)) { + CompatibleGroup = false; + break; + } + // If one of the SU in the subgraph depends on the result of SU j, + // there'll be a data dependency. + if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + } + if (!CompatibleGroup) + break; + // Same check for the SU + if (hasDataDependencyPred(SU, DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + // Add all the required instructions to the block + // These cannot live in another block (because they + // depend (order dependency) on one of the + // instruction in the block, and are required for the + // high latency instruction we add. + AdditionalElements.insert(AdditionalElements.end(), + SubGraph.begin(), SubGraph.end()); + } + } + if (CompatibleGroup) { + FormingGroup.insert(SU.NodeNum); + for (unsigned j : AdditionalElements) + CurrentColoring[j] = ProposedColor; + CurrentColoring[SU.NodeNum] = ProposedColor; + ++Count; } - if (!CompatibleGroup || ++Count == GroupSize) { + // Found one incompatible instruction, + // or has filled a big enough group. + // -> start a new one. + if (!CompatibleGroup) { FormingGroup.clear(); Color = ++NextReservedID; - if (!CompatibleGroup) { - ProposedColor = Color; - FormingGroup.insert(SU->NodeNum); - } + ProposedColor = Color; + FormingGroup.insert(SU.NodeNum); + CurrentColoring[SU.NodeNum] = ProposedColor; + Count = 0; + } else if (Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + ProposedColor = Color; Count = 0; - } else { - FormingGroup.insert(SU->NodeNum); } - CurrentColoring[SU->NodeNum] = ProposedColor; } } } @@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { unsigned DAGSize = DAG->SUnits.size(); std::vector<int> PendingColoring = CurrentColoring; + assert(DAGSize >= 1 && + CurrentBottomUpReservedDependencyColoring.size() == DAGSize && + CurrentTopDownReservedDependencyColoring.size() == DAGSize); + // If there is no reserved block at all, do nothing. We don't want + // everything in one block. 
+ if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), + CurrentBottomUpReservedDependencyColoring.end()) == 0 && + *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), + CurrentTopDownReservedDependencyColoring.end()) == 0) + return; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; @@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { SUColors.insert(CurrentColoring[Succ->NodeNum]); SUColorsPending.insert(PendingColoring[Succ->NodeNum]); } + // If there is only one child/parent block, and that block + // is not among the ones we are removing in this path, then + // merge the instruction to that block if (SUColors.size() == 1 && SUColorsPending.size() == 1) PendingColoring[SU->NodeNum] = *SUColors.begin(); else // TODO: Attribute new colors depending on color @@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; - std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); - if (Pos != ColorCount.end()) { - ++ColorCount[color]; - } else { - ColorCount[color] = 1; - } + ++ColorCount[color]; } for (unsigned SUNum : DAG->BottomUpIndex2SU) { @@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) continue; if (Node2CurrentBlock[Succ->NodeNum] != SUID) - CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], + SuccDep.isCtrl() ? NoData : Data); } for (SDep& PredDep : SU->Preds) { SUnit *Pred = PredDep.getSUnit(); @@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() { Block->Height = 0; else { unsigned Height = 0; - for (SIScheduleBlock *Succ : Block->getSuccs()) { - if (Height < Succ->Height + 1) - Height = Succ->Height + 1; - } + for (const auto &Succ : Block->getSuccs()) + Height = std::min(Height, Succ.first->Height + 1); Block->Height = Height; } } @@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, continue; int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; - std::map<unsigned, unsigned>::iterator RegPos = - LiveOutRegsNumUsages[PredID].find(Reg); - if (RegPos != LiveOutRegsNumUsages[PredID].end()) { - ++LiveOutRegsNumUsages[PredID][Reg]; - } else { - LiveOutRegsNumUsages[PredID][Reg] = 1; - } + ++LiveOutRegsNumUsages[PredID][Reg]; } } @@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, std::set<unsigned> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); + // Increase LiveOutRegsNumUsages for blocks + // producing registers consumed in another + // scheduling region. + for (unsigned Reg : DAG->getOutRegs()) { + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + // Do reverse traversal + int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; + SIScheduleBlock *Block = Blocks[ID]; + const std::set<unsigned> &OutRegs = Block->getOutRegs(); + + if (OutRegs.find(Reg) == OutRegs.end()) + continue; + + ++LiveOutRegsNumUsages[ID][Reg]; + break; + } + } + // Fill LiveRegsConsumers for regs that were already // defined before scheduling. 
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { @@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } } - if (!Found) { - if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) - LiveRegsConsumers[Reg] = 1; - else - ++LiveRegsConsumers[Reg]; - } + if (!Found) + ++LiveRegsConsumers[Reg]; } } @@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, for (SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } + dbgs() << '\n'; ); } @@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { VregCurrentUsage, SregCurrentUsage); if (VregCurrentUsage > maxVregUsage) maxVregUsage = VregCurrentUsage; - if (VregCurrentUsage > maxSregUsage) - maxSregUsage = VregCurrentUsage; + if (SregCurrentUsage > maxSregUsage) + maxSregUsage = SregCurrentUsage; DEBUG( dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; @@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, } void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { - for (SIScheduleBlock* Block : Parent->getSuccs()) { - --BlockNumPredsLeft[Block->getID()]; - if (BlockNumPredsLeft[Block->getID()] == 0) { - ReadyBlocks.push_back(Block); - } - // TODO: Improve check. When the dependency between the high latency - // instructions and the instructions of the other blocks are WAR or WAW - // there will be no wait triggered. We would like these cases to not - // update LastPosHighLatencyParentScheduled. - if (Parent->isHighLatencyBlock()) - LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + for (const auto &Block : Parent->getSuccs()) { + if (--BlockNumPredsLeft[Block.first->getID()] == 0) + ReadyBlocks.push_back(Block.first); + + if (Parent->isHighLatencyBlock() && + Block.second == SIScheduleBlockLinkKind::Data) + LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; } } @@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { LiveOutRegsNumUsages[Block->getID()].begin(), E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { std::pair<unsigned, unsigned> RegP = *RegI; - if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) - LiveRegsConsumers[RegP.first] = RegP.second; - else { - assert(LiveRegsConsumers[RegP.first] == 0); - LiveRegsConsumers[RegP.first] += RegP.second; - } + // We produce this register, thus it must not be previously alive. + assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || + LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; } if (LastPosHighLatencyParentScheduled[Block->getID()] > (unsigned)LastPosWaitedHighLatency) @@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is extremely high, try other good performing variants // which could lead to lower VGPR usage if (Best.MaxVGPRUsage > 180) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { { LatenciesAlone, BlockRegUsageLatency }, // { LatenciesAlone, BlockRegUsage }, { LatenciesGrouped, BlockLatencyRegUsage }, @@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is still extremely high, we may spill. Try other variants // which are less performing, but that could lead to lower VGPR usage. 
if (Best.MaxVGPRUsage > 200) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { // { LatenciesAlone, BlockRegUsageLatency }, { LatenciesAlone, BlockRegUsage }, // { LatenciesGrouped, BlockLatencyRegUsage }, diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index 77c0735..122d0f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -40,13 +40,12 @@ enum SIScheduleCandReason { struct SISchedulerCandidate { // The reason for this candidate. - SIScheduleCandReason Reason; + SIScheduleCandReason Reason = NoCand; // Set of reasons that apply to multiple candidates. - uint32_t RepeatReasonSet; + uint32_t RepeatReasonSet = 0; - SISchedulerCandidate() - : Reason(NoCand), RepeatReasonSet(0) {} + SISchedulerCandidate() = default; bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } @@ -55,6 +54,11 @@ struct SISchedulerCandidate { class SIScheduleDAGMI; class SIScheduleBlockCreator; +enum SIScheduleBlockLinkKind { + NoData, + Data +}; + class SIScheduleBlock { SIScheduleDAGMI *DAG; SIScheduleBlockCreator *BC; @@ -84,8 +88,8 @@ class SIScheduleBlock { std::set<unsigned> LiveInRegs; std::set<unsigned> LiveOutRegs; - bool Scheduled; - bool HighLatencyBlock; + bool Scheduled = false; + bool HighLatencyBlock = false; std::vector<unsigned> HasLowLatencyNonWaitedParent; @@ -93,14 +97,14 @@ class SIScheduleBlock { unsigned ID; std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. - std::vector<SIScheduleBlock*> Succs; // All blocks successors. - unsigned NumHighLatencySuccessors; + // All blocks successors, and the kind of link + std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs; + unsigned NumHighLatencySuccessors = 0; public: SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, unsigned ID): - DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false), - HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {} + DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {} ~SIScheduleBlock() = default; @@ -114,10 +118,11 @@ public: // Add block pred, which has instruction predecessor of SU. 
void addPred(SIScheduleBlock *Pred); - void addSucc(SIScheduleBlock *Succ); + void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind); const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } - const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; } + ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> + getSuccs() const { return Succs; } unsigned Height; // Maximum topdown path length to block without outputs unsigned Depth; // Maximum bottomup path length to block without inputs @@ -213,9 +218,9 @@ struct SIScheduleBlocks { }; enum SISchedulerBlockCreatorVariant { - LatenciesAlone, - LatenciesGrouped, - LatenciesAlonePlusConsecutive + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive }; class SIScheduleBlockCreator { @@ -451,6 +456,7 @@ public: LiveIntervals *getLIS() { return LIS; } MachineRegisterInfo *getMRI() { return &MRI; } const TargetRegisterInfo *getTRI() { return TRI; } + ScheduleDAGTopologicalSort *GetTopo() { return &Topo; } SUnit& getEntrySU() { return EntrySU; } SUnit& getExitSU() { return ExitSU; } @@ -469,6 +475,14 @@ public: return InRegs; } + std::set<unsigned> getOutRegs() { + std::set<unsigned> OutRegs; + for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { + OutRegs.insert(RegMaskPair.RegUnit); + } + return OutRegs; + }; + unsigned getVGPRSetID() const { return VGPRSetID; } unsigned getSGPRSetID() const { return SGPRSetID; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp new file mode 100644 index 0000000..e2ac663 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -0,0 +1,846 @@ +//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass tries to apply several peephole SDWA patterns. +/// +/// E.g. 
original: +/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1 +/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3 +/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2 +/// +/// Replace: +/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3 +/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include <unordered_map> +#include <unordered_set> + +using namespace llvm; + +#define DEBUG_TYPE "si-peephole-sdwa" + +STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); +STATISTIC(NumSDWAInstructionsPeepholed, + "Number of instruction converted to SDWA."); + +namespace { + +class SDWAOperand; + +class SIPeepholeSDWA : public MachineFunctionPass { +public: + typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector; + +private: + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + + std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; + std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; + SmallVector<MachineInstr *, 8> ConvertedInstructions; + + Optional<int64_t> foldToImm(const MachineOperand &Op) const; + +public: + static char ID; + + SIPeepholeSDWA() : MachineFunctionPass(ID) { + initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void matchSDWAOperands(MachineFunction &MF); + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; + bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; + + StringRef getPassName() const override { return "SI Peephole SDWA"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +class SDWAOperand { +private: + MachineOperand *Target; // Operand that would be used in converted instruction + MachineOperand *Replaced; // Operand that would be replace by Target + +public: + SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) + : Target(TargetOp), Replaced(ReplacedOp) { + assert(Target->isReg()); + assert(Replaced->isReg()); + } + + virtual ~SDWAOperand() {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; + + MachineOperand *getTargetOperand() const { return Target; } + MachineOperand *getReplacedOperand() const { return Replaced; } + MachineInstr *getParentInst() const { return Target->getParent(); } + MachineRegisterInfo *getMRI() const { + return &getParentInst()->getParent()->getParent()->getRegInfo(); + } +}; + +using namespace AMDGPU::SDWA; + +class SDWASrcOperand : public SDWAOperand { +private: + SdwaSel SrcSel; + bool Abs; + bool Neg; + bool Sext; + +public: + SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, + SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, + bool Sext_ = false) + : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), + Neg(Neg_), Sext(Sext_) {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + virtual bool convertToSDWA(MachineInstr &MI, const 
SIInstrInfo *TII) override; + + SdwaSel getSrcSel() const { return SrcSel; } + bool getAbs() const { return Abs; } + bool getNeg() const { return Neg; } + bool getSext() const { return Sext; } + + uint64_t getSrcMods(const SIInstrInfo *TII, + const MachineOperand *SrcOp) const; +}; + +class SDWADstOperand : public SDWAOperand { +private: + SdwaSel DstSel; + DstUnused DstUn; + +public: + SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, + SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) + : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + + SdwaSel getDstSel() const { return DstSel; } + DstUnused getDstUnused() const { return DstUn; } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) + +char SIPeepholeSDWA::ID = 0; + +char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; + +FunctionPass *llvm::createSIPeepholeSDWAPass() { + return new SIPeepholeSDWA(); +} + +#ifndef NDEBUG + +static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { + switch(Sel) { + case BYTE_0: OS << "BYTE_0"; break; + case BYTE_1: OS << "BYTE_1"; break; + case BYTE_2: OS << "BYTE_2"; break; + case BYTE_3: OS << "BYTE_3"; break; + case WORD_0: OS << "WORD_0"; break; + case WORD_1: OS << "WORD_1"; break; + case DWORD: OS << "DWORD"; break; + } + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { + switch(Un) { + case UNUSED_PAD: OS << "UNUSED_PAD"; break; + case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; + case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; + } + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) { + OS << "SDWA src: " << *Src.getTargetOperand() + << " src_sel:" << Src.getSrcSel() + << " abs:" << Src.getAbs() << " neg:" << Src.getNeg() + << " sext:" << Src.getSext() << '\n'; + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) { + OS << "SDWA dst: " << *Dst.getTargetOperand() + << " dst_sel:" << Dst.getDstSel() + << " dst_unused:" << Dst.getDstUnused() << '\n'; + return OS; +} + +#endif + +static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { + assert(To.isReg() && From.isReg()); + To.setReg(From.getReg()); + To.setSubReg(From.getSubReg()); + To.setIsUndef(From.isUndef()); + if (To.isUse()) { + To.setIsKill(From.isKill()); + } else { + To.setIsDead(From.isDead()); + } +} + +static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { + return LHS.isReg() && + RHS.isReg() && + LHS.getReg() == RHS.getReg() && + LHS.getSubReg() == RHS.getSubReg(); +} + +static bool isSubregOf(const MachineOperand &SubReg, + const MachineOperand &SuperReg, + const TargetRegisterInfo *TRI) { + + if (!SuperReg.isReg() || !SubReg.isReg()) + return false; + + if (isSameReg(SuperReg, SubReg)) + return true; + + if (SuperReg.getReg() != SubReg.getReg()) + return false; + + LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()); + LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg()); + SuperMask |= ~SubMask; + return SuperMask.all(); +} + +uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, + const MachineOperand *SrcOp) const { + uint64_t Mods = 0; + const auto *MI = SrcOp->getParent(); + if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { + 
if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + Mods = Mod->getImm(); + } + } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { + if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { + Mods = Mod->getImm(); + } + } + if (Abs || Neg) { + assert(!Sext && + "Float and integer src modifiers can't be set simulteniously"); + Mods |= Abs ? SISrcMods::ABS : 0; + Mods ^= Neg ? SISrcMods::NEG : 0; + } else if (Sext) { + Mods |= SISrcMods::SEXT; + } + + return Mods; +} + +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { + // For SDWA src operand potential instruction is one that use register + // defined by parent instruction + MachineRegisterInfo *MRI = getMRI(); + MachineOperand *Replaced = getReplacedOperand(); + assert(Replaced->isReg()); + + MachineInstr *PotentialMI = nullptr; + for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) { + // If this is use of another subreg of dst reg then do nothing + if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) + continue; + + // If there exist use of superreg of dst then we should not combine this + // opernad + if (!isSameReg(PotentialMO, *Replaced)) + return nullptr; + + // Check that PotentialMI is only instruction that uses dst reg + if (PotentialMI == nullptr) { + PotentialMI = PotentialMO.getParent(); + } else if (PotentialMI != PotentialMO.getParent()) { + return nullptr; + } + } + + return PotentialMI; +} + +bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { + // Find operand in instruction that matches source operand and replace it with + // target operand. Set corresponding src_sel + + MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); + MachineOperand *SrcMods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + assert(Src && (Src->isReg() || Src->isImm())); + if (!isSameReg(*Src, *getReplacedOperand())) { + // If this is not src0 then it should be src1 + Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); + SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + assert(Src && Src->isReg()); + + if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && + !isSameReg(*Src, *getReplacedOperand())) { + // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to + // src2. This is not allowed. 
+ return false; + } + + assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods); + } + copyRegOperand(*Src, *getTargetOperand()); + SrcSel->setImm(getSrcSel()); + SrcMods->setImm(getSrcMods(TII, Src)); + getTargetOperand()->setIsKill(false); + return true; +} + +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { + // For SDWA dst operand potential instruction is one that defines register + // that this operand uses + MachineRegisterInfo *MRI = getMRI(); + MachineInstr *ParentMI = getParentInst(); + MachineOperand *Replaced = getReplacedOperand(); + assert(Replaced->isReg()); + + for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) { + if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) + continue; + + if (!isSameReg(*Replaced, PotentialMO)) + return nullptr; + + // Check that ParentMI is the only instruction that uses replaced register + for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) { + if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) && + UseMO.getParent() != ParentMI) { + return nullptr; + } + } + + // Due to SSA this should be onle def of replaced register, so return it + return PotentialMO.getParent(); + } + + return nullptr; +} + +bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { + // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused + + if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && + getDstSel() != AMDGPU::SDWA::DWORD) { + // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD + return false; + } + + MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + assert(Operand && + Operand->isReg() && + isSameReg(*Operand, *getReplacedOperand())); + copyRegOperand(*Operand, *getTargetOperand()); + MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); + assert(DstSel); + DstSel->setImm(getDstSel()); + MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + assert(DstUnused); + DstUnused->setImm(getDstUnused()); + + // Remove original instruction because it would conflict with our new + // instruction by register definition + getParentInst()->eraseFromParent(); + return true; +} + +Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { + if (Op.isImm()) { + return Op.getImm(); + } + + // If this is not immediate then it can be copy of immediate value, e.g.: + // %vreg1<def> = S_MOV_B32 255; + if (Op.isReg()) { + for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { + if (!isSameReg(Op, Def)) + continue; + + const MachineInstr *DefInst = Def.getParent(); + if (!TII->isFoldableCopy(*DefInst)) + return None; + + const MachineOperand &Copied = DefInst->getOperand(1); + if (!Copied.isImm()) + return None; + + return Copied.getImm(); + } + } + + return None; +} + +void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_LSHLREV_B32_e32: + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_LSHLREV_B32_e64: { + // from: v_lshrrev_b32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 + + // from: v_ashrrev_i32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 + + // from: v_lshlrev_b32_e32 v1, 16/24, v0 + 
// to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm) + break; + + if (*Imm != 16 && *Imm != 24) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || + Opcode == AMDGPU::V_LSHLREV_B32_e64) { + auto SDWADst = make_unique<SDWADstOperand>( + Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); + SDWAOperands[&MI] = std::move(SDWADst); + ++NumSDWAPatternsFound; + } else { + auto SDWASrc = make_unique<SDWASrcOperand>( + Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, + Opcode != AMDGPU::V_LSHRREV_B32_e32 && + Opcode != AMDGPU::V_LSHRREV_B32_e64); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + } + break; + } + + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_LSHLREV_B16_e64: { + // from: v_lshrrev_b16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 + + // from: v_ashrrev_i16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 sext:1 + + // from: v_lshlrev_b16_e32 v1, 8, v0 + // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm || *Imm != 8) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || + Opcode == AMDGPU::V_LSHLREV_B16_e64) { + auto SDWADst = + make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); + SDWAOperands[&MI] = std::move(SDWADst); + ++NumSDWAPatternsFound; + } else { + auto SDWASrc = make_unique<SDWASrcOperand>( + Src1, Dst, BYTE_1, false, false, + Opcode != AMDGPU::V_LSHRREV_B16_e32 && + Opcode != AMDGPU::V_LSHRREV_B16_e64); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + } + break; + } + + case AMDGPU::V_BFE_I32: + case AMDGPU::V_BFE_U32: { + // e.g.: + // from: v_bfe_u32 v1, v0, 8, 8 + // to SDWA src:v0 src_sel:BYTE_1 + + // offset | width | src_sel + // ------------------------ + // 0 | 8 | BYTE_0 + // 0 | 16 | WORD_0 + // 0 | 32 | DWORD ? 
+ // 8 | 8 | BYTE_1 + // 16 | 8 | BYTE_2 + // 16 | 16 | WORD_1 + // 24 | 8 | BYTE_3 + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + auto Offset = foldToImm(*Src1); + if (!Offset) + break; + + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + auto Width = foldToImm(*Src2); + if (!Width) + break; + + SdwaSel SrcSel = DWORD; + + if (*Offset == 0 && *Width == 8) + SrcSel = BYTE_0; + else if (*Offset == 0 && *Width == 16) + SrcSel = WORD_0; + else if (*Offset == 0 && *Width == 32) + SrcSel = DWORD; + else if (*Offset == 8 && *Width == 8) + SrcSel = BYTE_1; + else if (*Offset == 16 && *Width == 8) + SrcSel = BYTE_2; + else if (*Offset == 16 && *Width == 16) + SrcSel = WORD_1; + else if (*Offset == 24 && *Width == 8) + SrcSel = BYTE_3; + else + break; + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src0->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + auto SDWASrc = make_unique<SDWASrcOperand>( + Src0, Dst, SrcSel, false, false, + Opcode == AMDGPU::V_BFE_U32 ? false : true); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + break; + } + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: { + // e.g.: + // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 + // to SDWA src:v0 src_sel:WORD_0/BYTE_0 + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + auto ValSrc = Src1; + auto Imm = foldToImm(*Src0); + + if (!Imm) { + Imm = foldToImm(*Src1); + ValSrc = Src0; + } + + if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) + break; + + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + auto SDWASrc = make_unique<SDWASrcOperand>( + ValSrc, Dst, *Imm == 0x0000ffff ? 
WORD_0 : BYTE_0); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + break; + } + } + } + } +} + +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { + // Check if this instruction has opcode that supports SDWA + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + return false; + + if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->isVOPC(Opc)) { + if (!ST.hasSDWASdst()) { + const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (SDst && SDst->getReg() != AMDGPU::VCC) + return false; + } + + if (!ST.hasSDWAOutModsVOPC() && + (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) + return false; + + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || + !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + return false; + } + + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + Opc == AMDGPU::V_MAC_F32_e32)) + return false; + + return true; +} + +bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, + const SDWAOperandsVector &SDWAOperands) { + // Convert to sdwa + int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode()); + if (SDWAOpcode == -1) + SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode())); + assert(SDWAOpcode != -1); + + const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); + + // Create SDWA version of instruction MI and initialize its operands + MachineInstrBuilder SDWAInst = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + + // Copy dst, if it is present in original then should also be present in SDWA + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (Dst) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); + SDWAInst.add(*Dst); + } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { + assert(Dst && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.add(*Dst); + } else { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.addReg(AMDGPU::VCC, RegState::Define); + } + + // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and + // src0_modifiers (except for v_nop_sdwa, but it can't get here) + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + assert( + Src0 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); + if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) + SDWAInst.addImm(Mod->getImm()); + else + SDWAInst.addImm(0); + SDWAInst.add(*Src0); + + // Copy src1 if present, initialize src1_modifiers. 
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) { + assert( + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); + if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) + SDWAInst.addImm(Mod->getImm()); + else + SDWAInst.addImm(0); + SDWAInst.add(*Src1); + } + + if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || + SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { + // v_mac_f16/32 has additional src2 operand tied to vdst + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + assert(Src2); + SDWAInst.add(*Src2); + } + + // Copy clamp if present, initialize otherwise + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); + MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp) { + SDWAInst.add(*Clamp); + } else { + SDWAInst.addImm(0); + } + + // Copy omod if present, initialize otherwise if needed + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + SDWAInst.add(*OMod); + } else { + SDWAInst.addImm(0); + } + } + + // Initialize dst_sel if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Initialize dst_unused if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { + SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); + } + + // Initialize src0_sel + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + + + // Initialize src1_sel if present + if (Src1) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Apply all sdwa operand pattenrs + bool Converted = false; + for (auto &Operand : SDWAOperands) { + // There should be no intesection between SDWA operands and potential MIs + // e.g.: + // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 + // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 + // v_add_u32 v3, v4, v2 + // + // In that example it is possible that we would fold 2nd instruction into 3rd + // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was + // already destroyed). So if SDWAOperand is also a potential MI then do not + // apply it. + if (PotentialMatches.count(Operand->getParentInst()) == 0) + Converted |= Operand->convertToSDWA(*SDWAInst, TII); + } + if (Converted) { + ConvertedInstructions.push_back(SDWAInst); + } else { + SDWAInst->eraseFromParent(); + return false; + } + + DEBUG(dbgs() << "Convert instruction:" << MI + << "Into:" << *SDWAInst << '\n'); + ++NumSDWAInstructionsPeepholed; + + MI.eraseFromParent(); + return true; +} + +// If an instruction was converted to SDWA it should not have immediates or SGPR +// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { + const MCInstrDesc &Desc = TII->get(MI.getOpcode()); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { + if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) + continue; + + unsigned I = MI.getOperandNo(&Op); + if (Desc.OpInfo[I].RegClass == -1 || + !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) + continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), VGPR); + if (Op.isImm()) + Copy.addImm(Op.getImm()); + else if (Op.isReg()) + Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, + Op.getSubReg()); + Op.ChangeToRegister(VGPR, false); + } +} + +bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + if (!ST.hasSDWA()) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + + // Find all SDWA operands in MF. + matchSDWAOperands(MF); + + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + PotentialMatches[PotentialMI].push_back(Operand.get()); + } + } + + for (auto &PotentialPair : PotentialMatches) { + MachineInstr &PotentialMI = *PotentialPair.first; + convertToSDWA(PotentialMI, PotentialPair.second); + } + + PotentialMatches.clear(); + SDWAOperands.clear(); + + bool Ret = !ConvertedInstructions.empty(); + while (!ConvertedInstructions.empty()) + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); + + return Ret; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1ed5e8..4a3fbb4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #include "SIRegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -24,12 +24,6 @@ using namespace llvm; -static cl::opt<bool> EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - - static bool hasPressureSet(const int *PSets, unsigned PSetID) { for (unsigned i = 0; PSets[i] != -1; ++i) { if (PSets[i] == (int)PSetID) @@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, } } -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(), - SGPRPressureSets(getNumRegPressureSets()), - VGPRPressureSets(getNumRegPressureSets()) { +static cl::opt<bool> EnableSpillSGPRToSMEM( + "amdgpu-spill-sgpr-to-smem", + cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), + cl::init(false)); + +static cl::opt<bool> EnableSpillSGPRToVGPR( + "amdgpu-spill-sgpr-to-vgpr", + cl::desc("Enable spilling VGPRs to SGPRs"), + cl::ReallyHidden, + cl::init(true)); + 
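Both SGPR-spill controls (amdgpu-spill-sgpr-to-smem and amdgpu-spill-sgpr-to-vgpr) now sit next to each other in SIRegisterInfo.cpp, and the constructor in the next hunk folds them into the new SpillSGPRToSMEM / SpillSGPRToVGPR members. As a hedged restatement of that precedence only, reusing the names visible in the surrounding code (this is not the patch itself):

    // SMEM spilling wins when requested and the subtarget has scalar stores;
    // otherwise VGPR spilling applies if it is enabled (the default).
    bool SpillSGPRToSMEM = EnableSpillSGPRToSMEM && ST.hasScalarStores();
    bool SpillSGPRToVGPR = !SpillSGPRToSMEM && EnableSpillSGPRToVGPR;

Since both remain ordinary cl::opts, they are still reachable from llc (for example -amdgpu-spill-sgpr-to-smem=1) when exercising either spill path.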
+SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : + AMDGPURegisterInfo(), + SGPRPressureSets(getNumRegPressureSets()), + VGPRPressureSets(getNumRegPressureSets()), + SpillSGPRToVGPR(false), + SpillSGPRToSMEM(false) { + if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) + SpillSGPRToSMEM = true; + else if (EnableSpillSGPRToVGPR) + SpillSGPRToVGPR = true; + unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -97,17 +110,17 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4; + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - unsigned RegCount = getMaxNumSGPRs(MF); +static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned Reg; - // Try to place it in a hole after PrivateSegmentbufferReg. + // Try to place it in a hole after PrivateSegmentBufferReg. if (RegCount & 3) { // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to // alignment constraints, so we have a hole where can put the wave offset. @@ -117,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( // wave offset before it. Reg = RegCount - 5; } + + return Reg; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } +unsigned SIRegisterInfo::reservedStackPtrOffsetReg( + const MachineFunction &MF) const { + return AMDGPU::SGPR32; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -129,6 +155,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // M0 has to be reserved so that llvm accepts it as a live-in into a block. + reserveRegisterTuples(Reserved, AMDGPU::M0); + + // Reserve the memory aperture registers. + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve Trap Handler registers - support is not implemented in Codegen. 
reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -139,14 +174,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); - unsigned MaxNumSGPRs = getMaxNumSGPRs(MF); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); } - unsigned MaxNumVGPRs = getMaxNumVGPRs(MF); + unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); @@ -170,15 +207,37 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // We have to assume the SP is needed in case there are calls in the function, + // which is detected after the function is lowered. If we aren't really going + // to need SP, don't bother reserving it. + unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + + if (StackPtrReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, StackPtrReg); + assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); + } + + unsigned FrameReg = MFI->getFrameOffsetReg(); + if (FrameReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, FrameReg); + assert(!isSubRegister(ScratchRSrcReg, FrameReg)); + } + return Reserved; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo().hasStackObjects(); + const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); + if (Info->isEntryFunction()) { + const MachineFrameInfo &MFI = Fn.getFrameInfo(); + return MFI.hasStackObjects() || MFI.hasCalls(); + } + + // May need scavenger for dealing with callee saved registers. 
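Editor's note: getReservedRegs now pulls the per-function SGPR/VGPR budgets from the subtarget and additionally reserves M0, the aperture registers, and (when assigned) the stack pointer and frame registers; requiresRegisterScavenging also becomes unconditional for non-entry functions because callee-saved handling may need a scavenged register. A toy sketch of the reserve-above-the-budget loops, with a plain vector standing in for BitVector:

#include <cstdio>
#include <vector>

// Everything past the occupancy-derived budget is off limits to the
// allocator, mirroring the loops over SGPR_32/VGPR_32 in the hunk above.
std::vector<bool> computeReserved(unsigned TotalRegs, unsigned MaxAllowed) {
  std::vector<bool> Reserved(TotalRegs, false);
  for (unsigned R = MaxAllowed; R < TotalRegs; ++R)
    Reserved[R] = true;
  return Reserved;
}

int main() {
  auto R = computeReserved(/*TotalRegs=*/256, /*MaxAllowed=*/128);
  unsigned Count = 0;
  for (bool B : R)
    Count += B;
  printf("reserved %u of %zu registers\n", Count, R.size()); // 128 of 256
}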
+ return true; } -bool -SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { +bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects(); } @@ -253,7 +312,6 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -263,8 +321,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) .addFrameIndex(FrameIdx); - BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) - .addReg(UnusedCarry, RegState::Define | RegState::Dead) + TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); } @@ -292,8 +349,11 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); assert(FIOp && FIOp->isFI() && "frame index must be address operand"); - assert(TII->isMUBUF(MI)); + assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == + MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && + "should only be seeing frame offset relative FrameIndex"); + MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -415,14 +475,14 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg(); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(Reg, getDefRegState(!IsStore)) - .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) - .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + .addReg(Reg, getDefRegState(!IsStore)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); return true; } @@ -545,11 +605,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; } -void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, +bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, - RegScavenger *RS) const { + RegScavenger *RS, + bool OnlyToVGPR) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills + = MFI->getSGPRToVGPRSpills(Index); + bool SpillToVGPR = !VGPRSpills.empty(); + if (OnlyToVGPR && !SpillToVGPR) + return false; + MachineRegisterInfo &MRI = MF->getRegInfo(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -558,10 +627,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 
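Editor's note: spillSGPR (and restoreSGPR further down) now return bool and take an OnlyToVGPR flag, so a caller can ask for the lane-based spill and get a clean failure when no VGPR lanes were pre-assigned for the frame index, instead of silently falling back to scratch memory. A toy model of that contract; the names are illustrative, not the LLVM API.

#include <cstdio>
#include <map>
#include <vector>

// Per frame index, the list of pre-assigned (carrier VGPR, lane) slots.
struct SpilledReg { int VGPR; int Lane; };
using SpillTable = std::map<int, std::vector<SpilledReg>>;

// When OnlyToVGPR is set the function must either use the pre-assigned lanes
// or report failure; it never falls back to the memory path.
bool spillSGPR(const SpillTable &Table, int FrameIndex, bool OnlyToVGPR) {
  auto It = Table.find(FrameIndex);
  bool SpillToVGPR = It != Table.end() && !It->second.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false; // caller keeps the frame index and handles it elsewhere
  if (SpillToVGPR) {
    for (const SpilledReg &S : It->second)
      printf("v_writelane v%d, lane %d\n", S.VGPR, S.Lane);
    return true;
  }
  printf("spill to scratch memory\n"); // models the buffer_store fallback
  return true;
}

int main() {
  SpillTable T;
  T[0] = {{0, 3}};
  spillSGPR(T, 0, /*OnlyToVGPR=*/true); // uses the pre-assigned lane
  spillSGPR(T, 1, /*OnlyToVGPR=*/true); // returns false, emits nothing
}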
- bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + bool SpillToSMEM = spillSGPRToSMEM(); + if (SpillToSMEM && OnlyToVGPR) + return false; assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -582,7 +652,8 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true); + std::tie(EltSize, ScalarStoreOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, true); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -617,11 +688,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()) + .addReg(MFI->getFrameOffsetReg()) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()); + .addReg(MFI->getFrameOffsetReg()); } BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) @@ -634,9 +705,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, continue; } - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - if (Spill.hasReg()) { + if (SpillToVGPR) { + SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) @@ -647,6 +718,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, // frame index, we should delete the frame index when all references to // it are fixed. } else { + // XXX - Can to VGPR spill fail for some subregisters but not others? + if (OnlyToVGPR) + return false; + // Spill SGPR to a frame index. // TODO: Should VI try to spill to VGPR and then spill to SMEM? 
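Editor's note: in the scalar-store (SMEM) path above, the scratch offset register is seeded with frame_offset_reg + wavefront_size * FrOffset + elt_size * i, emitted as a plain S_MOV_B32 when the immediate part is zero and an S_ADD_U32 otherwise. FrOffset is, judging from the surrounding code, the frame object's offset. A tiny sketch of the addend arithmetic with made-up numbers:

#include <cstdint>
#include <cstdio>

// Immediate addend applied to the frame offset register for element I of a
// spill slot: the per-wave object offset is scaled by the wavefront size and
// the element index by the element size (4 bytes for a single SGPR).
int64_t smemSpillAddend(int64_t FrameObjectOffset, unsigned WavefrontSize,
                        unsigned EltSize, unsigned I) {
  return WavefrontSize * FrameObjectOffset + int64_t(EltSize) * I;
}

int main() {
  // A 2-dword SGPR pair at frame object offset 16, wave64, 4-byte elements.
  for (unsigned I = 0; I < 2; ++I)
    printf("elt %u -> +%lld bytes\n", I,
           (long long)smemSpillAddend(16, 64, 4, I)); // +1024, +1028
}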
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -674,11 +749,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getScratchWaveOffsetReg()) // soffset - .addImm(i * 4) // offset + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getFrameOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); } } @@ -690,22 +765,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MI->eraseFromParent(); MFI->addToSpilledSGPRs(NumSubRegs); + return true; } -void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, +bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, - RegScavenger *RS) const { + RegScavenger *RS, + bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills + = MFI->getSGPRToVGPRSpills(Index); + bool SpillToVGPR = !VGPRSpills.empty(); + if (OnlyToVGPR && !SpillToVGPR) + return false; + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); unsigned SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + bool SpillToSMEM = spillSGPRToSMEM(); + if (SpillToSMEM && OnlyToVGPR) + return false; assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -727,7 +813,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false); + std::tie(EltSize, ScalarLoadOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, false); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -753,11 +840,11 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()) + .addReg(MFI->getFrameOffsetReg()) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getScratchWaveOffsetReg()); + .addReg(MFI->getFrameOffsetReg()); } auto MIB = @@ -773,10 +860,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, continue; } - SIMachineFunctionInfo::SpilledReg Spill - = MFI->getSpilledReg(MF, Index, i); - - if (Spill.hasReg()) { + if (SpillToVGPR) { + SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; auto MIB = BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) @@ -786,6 +871,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (NumSubRegs > 1) MIB.addReg(SuperReg, RegState::ImplicitDefine); } else { + if (OnlyToVGPR) + return false; + // Restore SGPR from a stack slot. 
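Editor's note: the VGPR path stores each 32-bit subregister of the spilled SGPR tuple into one lane of a carrier VGPR with V_WRITELANE_B32, and restoreSGPR reads it back with V_READLANE_B32. A standalone model of that lane bookkeeping follows; wave64 is assumed, and the lane choice here is arbitrary (in the pass it comes from the SGPRToVGPRSpills table).

#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// One carrier VGPR: 64 lanes of 32 bits (wave64).
using VGPR = std::array<uint32_t, 64>;

void writelane(VGPR &V, unsigned Lane, uint32_t Val) { V[Lane] = Val; }
uint32_t readlane(const VGPR &V, unsigned Lane) { return V[Lane]; }

int main() {
  VGPR Carrier{};
  // Spill a 4-dword SGPR tuple (e.g. an SReg_128) into lanes 8..11.
  std::vector<uint32_t> SGPRTuple = {0x11, 0x22, 0x33, 0x44};
  for (unsigned I = 0; I < SGPRTuple.size(); ++I)
    writelane(Carrier, 8 + I, SGPRTuple[I]);            // V_WRITELANE_B32

  // Restore: read the lanes back into the subregisters.
  for (unsigned I = 0; I < SGPRTuple.size(); ++I)
    assert(readlane(Carrier, 8 + I) == SGPRTuple[I]);   // V_READLANE_B32
  return 0;
}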
// FIXME: We should use S_LOAD_DWORD here for VI. unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -799,10 +887,10 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getScratchWaveOffsetReg()) // soffset - .addImm(i * 4) // offset + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getFrameOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); auto MIB = @@ -820,6 +908,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, } MI->eraseFromParent(); + return true; +} + +/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to +/// a VGPR and the stack slot can be safely eliminated when all other users are +/// handled. +bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( + MachineBasicBlock::iterator MI, + int FI, + RegScavenger *RS) const { + switch (MI->getOpcode()) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: + return spillSGPR(MI, FI, RS, true); + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: + return restoreSGPR(MI, FI, RS, true); + default: + llvm_unreachable("not an SGPR spill instruction"); + } } void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, @@ -901,12 +1015,83 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } default: { - if (TII->isMUBUF(*MI)) { + const DebugLoc &DL = MI->getDebugLoc(); + bool IsMUBUF = TII->isMUBUF(*MI); + + if (!IsMUBUF && + MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + // Convert to an absolute stack address by finding the offset from the + // scratch wave base and scaling by the wave size. + // + // In an entry function/kernel the stack address is already the absolute + // address relative to the the scratch wave offset. + + unsigned DiffReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; + unsigned ResultReg = IsCopy ? + MI->getOperand(0).getReg() : + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) + .addReg(MFI->getFrameOffsetReg()) + .addReg(MFI->getScratchWaveOffsetReg()); + + int64_t Offset = FrameInfo.getObjectOffset(Index); + if (Offset == 0) { + // XXX - This never happens because of emergency scavenging slot at 0? + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg); + } else { + unsigned CarryOut + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned ScaledReg + = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg, RegState::Kill); + + // TODO: Fold if use instruction is another add of a constant. 
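Editor's note: for non-MUBUF users of a frame index in a function whose frame offset register differs from the scratch wave offset, the default case now expands the index into an absolute per-lane address: subtract the scratch wave offset (S_SUB_U32), shift right by log2 of the wavefront size (V_LSHRREV_B32), then add the object offset (the add itself lands in the next hunk). A scalar model of that arithmetic with made-up register values:

#include <cstdint>
#include <cstdio>

// Per-lane stack address for a frame object, following the expansion above:
//   diff   = frame_offset_reg - scratch_wave_offset_reg   (bytes, per wave)
//   scaled = diff >> log2(wavefront_size)                 (bytes, per lane)
//   addr   = scaled + object_offset
uint32_t lowerFrameIndex(uint32_t FrameOffsetReg, uint32_t ScratchWaveOffsetReg,
                         unsigned WavefrontSizeLog2, uint32_t ObjectOffset) {
  uint32_t Diff = FrameOffsetReg - ScratchWaveOffsetReg; // S_SUB_U32
  uint32_t Scaled = Diff >> WavefrontSizeLog2;           // V_LSHRREV_B32
  return Scaled + ObjectOffset;                          // V_ADD_I32
}

int main() {
  // Illustrative values: wave64, frame region 0x4000 bytes past the scratch
  // wave offset, object at +16 within the frame.
  printf("0x%x\n", lowerFrameIndex(0x14000, 0x10000, 6, 16)); // 0x110
}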
+ if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addImm(Offset) + .addReg(ScaledReg, RegState::Kill); + } else { + unsigned ConstOffsetReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addReg(ConstOffsetReg, RegState::Kill) + .addReg(ScaledReg, RegState::Kill); + } + + MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); + } + + // Don't introduce an extra copy if we're just materializing in a mov. + if (IsCopy) + MI->eraseFromParent(); + else + FIOp.ChangeToRegister(ResultReg, false, false, true); + return; + } + + if (IsMUBUF) { // Disable offen so we don't need a 0 vgpr base. assert(static_cast<int>(FIOperandNum) == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() + == MFI->getFrameOffsetReg()); + int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); @@ -915,23 +1100,85 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (isUInt<12>(NewOffset) && buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); - break; + return; } } + // If the offset is simply too big, don't convert to a scratch wave offset + // relative index. + int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); } } } } +StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { + #define AMDGPU_REG_ASM_NAMES + #include "AMDGPURegAsmNames.inc.cpp" + + #define REG_RANGE(BeginReg, EndReg, RegTable) \ + if (Reg >= BeginReg && Reg <= EndReg) { \ + unsigned Index = Reg - BeginReg; \ + assert(Index < array_lengthof(RegTable)); \ + return RegTable[Index]; \ + } + + REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames); + REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames); + REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames); + REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames); + REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255, + VGPR96RegNames); + + REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3, + AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255, + VGPR128RegNames); + REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, + AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103, + SGPR128RegNames); + + REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7, + AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, + VGPR256RegNames); + + REG_RANGE( + AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15, + AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, + VGPR512RegNames); + + REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7, + 
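Editor's note: the MUBUF case folds the frame object offset into the instruction's immediate only while the sum still fits the 12-bit unsigned offset field of the encoding; otherwise the offset is materialized into a register. A sketch of that check, mirroring isUInt<12>:

#include <cstdint>
#include <cstdio>

// The existing immediate offset and the frame object offset are summed; the
// fold is legal only if the result still fits the 12-bit MUBUF offset field.
bool canFoldMUBUFOffset(int64_t OldImm, int64_t ObjectOffset,
                        int64_t &NewOffset) {
  NewOffset = OldImm + ObjectOffset;
  return NewOffset >= 0 && NewOffset < (1 << 12); // isUInt<12>
}

int main() {
  int64_t N;
  printf("%d\n", canFoldMUBUFOffset(8, 4000, N)); // 1: 4008 fits in 12 bits
  printf("%d\n", canFoldMUBUFOffset(8, 5000, N)); // 0: 5008 does not
}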
AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, + SGPR256RegNames); + + REG_RANGE( + AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15, + AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, + SGPR512RegNames + ); + +#undef REG_RANGE + + // FIXME: Rename flat_scr so we don't need to special case this. + switch (Reg) { + case AMDGPU::FLAT_SCR: + return "flat_scratch"; + case AMDGPU::FLAT_SCR_LO: + return "flat_scratch_lo"; + case AMDGPU::FLAT_SCR_HI: + return "flat_scratch_hi"; + default: + // For the special named registers the default is fine. + return TargetRegisterInfo::getRegAsmName(Reg); + } +} + // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { @@ -963,20 +1210,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TODO: It might be helpful to have some target specific flags in // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - switch (RC->getSize()) { - case 0: return false; - case 1: return false; - case 4: + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; - case 8: + case 64: return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; - case 12: + case 96: return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; - case 16: + case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; - case 32: + case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; - case 64: + case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); @@ -985,18 +1233,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - switch (SRC->getSize()) { - case 4: + switch (getRegSizeInBits(*SRC)) { + case 32: return &AMDGPU::VGPR_32RegClass; - case 8: + case 64: return &AMDGPU::VReg_64RegClass; - case 12: + case 96: return &AMDGPU::VReg_96RegClass; - case 16: + case 128: return &AMDGPU::VReg_128RegClass; - case 32: + case 256: return &AMDGPU::VReg_256RegClass; - case 64: + case 512: return &AMDGPU::VReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1005,16 +1253,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( const TargetRegisterClass *VRC) const { - switch (VRC->getSize()) { - case 4: + switch (getRegSizeInBits(*VRC)) { + case 32: return &AMDGPU::SGPR_32RegClass; - case 8: + case 64: return &AMDGPU::SReg_64RegClass; - case 16: + case 128: return &AMDGPU::SReg_128RegClass; - case 32: + case 256: return &AMDGPU::SReg_256RegClass; - case 64: + case 512: return &AMDGPU::SReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1108,12 +1356,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { 
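Editor's note: hasVGPRs and the getEquivalent*Class helpers now key on getRegSizeInBits instead of the old byte-based RC->getSize, so the cases read 32/64/.../512. The size-to-class mapping, restated as a standalone table (class names copied from the hunk):

#include <cstdio>

const char *equivalentVGPRClass(unsigned SizeInBits) {
  switch (SizeInBits) {
  case 32:  return "VGPR_32";
  case 64:  return "VReg_64";
  case 96:  return "VReg_96";
  case 128: return "VReg_128";
  case 256: return "VReg_256";
  case 512: return "VReg_512";
  default:  return nullptr; // invalid register class size
  }
}

int main() {
  printf("%s\n", equivalentVGPRClass(128)); // VReg_128
}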
- assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; @@ -1156,210 +1403,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, return AMDGPU::NoRegister; } -unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 800; - return 512; -} - -unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 102; - return 104; -} - -unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST, - const SIMachineFunctionInfo &MFI) const { - if (MFI.hasFlatScratchInit()) { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // FLAT_SCRATCH, XNACK, VCC (in that order) - - if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) - return 4; // FLAT_SCRATCH, VCC (in that order) - } - - if (ST.isXNACKEnabled()) - return 4; // XNACK, VCC (in that order) - - return 2; // VCC. -} - -unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST, - unsigned WavesPerEU) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 0; - case 8: return 81; - default: return 97; - } - } else { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 49; - case 8: return 57; - case 7: return 65; - case 6: return 73; - case 5: return 81; - default: return 97; - } - } -} - -unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST, - unsigned WavesPerEU, - bool Addressable) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 80; - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return Addressable ? getNumAddressableSGPRs(ST) : 112; - } - } else { - switch (WavesPerEU) { - case 0: return 48; - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return getNumAddressableSGPRs(ST); - } - } -} - -unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - - // Compute maximum number of SGPRs function can use using default/requested - // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); - unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false); - unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true); - - // Check if maximum number of SGPRs was explicitly requested using - // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = AMDGPU::getIntegerAttribute( - F, "amdgpu-num-sgpr", MaxNumSGPRs); - - // Make sure requested value does not violate subtarget's specifications. 
- if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI))) - Requested = 0; - - // If more SGPRs are required to support the input user/system SGPRs, - // increase to accommodate them. - // - // FIXME: This really ends up using the requested number of SGPRs + number - // of reserved special registers in total. Theoretically you could re-use - // the last input registers for these special registers, but this would - // require a lot of complexity to deal with the weird aliasing. - unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs(); - if (Requested && Requested < NumInputSGPRs) - Requested = NumInputSGPRs; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumSGPRs = Requested; - } - - if (ST.hasSGPRInitBug()) - MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - - return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI), - MaxNumAddressableSGPRs); -} - -unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs( - const SISubtarget &ST) const { - if (ST.debuggerReserveRegs()) - return 4; - return 0; -} - -unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 25; - case 8: return 29; - case 7: return 33; - case 6: return 37; - case 5: return 41; - case 4: return 49; - case 3: return 65; - case 2: return 85; - default: return 129; - } -} - -unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 24; - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return getTotalNumVGPRs(); - } -} - -unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - - // Compute maximum number of VGPRs function can use using default/requested - // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); - unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); - - // Check if maximum number of VGPRs was explicitly requested using - // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = AMDGPU::getIntegerAttribute( - F, "amdgpu-num-vgpr", MaxNumVGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST)) - Requested = 0; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. 
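Editor's note: the deleted getMaxNumSGPRs(MF) (the logic moves to the subtarget) clamps an explicitly requested "amdgpu-num-sgpr" count: it must exceed the reserved registers, cover the preloaded input SGPRs, and stay inside the range implied by the waves-per-EU bounds, otherwise the default is kept. A standalone restatement with illustrative parameter names:

#include <cstdio>

unsigned clampRequestedSGPRs(unsigned Default, unsigned Requested,
                             unsigned NumReserved, unsigned NumInputSGPRs,
                             unsigned MaxForMinWaves, unsigned MinForMaxWaves) {
  if (Requested && Requested <= NumReserved)
    Requested = 0;             // would not even fit the reserved set
  if (Requested && Requested < NumInputSGPRs)
    Requested = NumInputSGPRs; // grow to cover preloaded user/system SGPRs
  if (Requested && Requested > MaxForMinWaves)
    Requested = 0;             // violates the minimum waves-per-EU bound
  if (Requested && MinForMaxWaves && Requested < MinForMaxWaves)
    Requested = 0;             // violates the maximum waves-per-EU bound
  return Requested ? Requested : Default;
}

int main() {
  printf("%u\n", clampRequestedSGPRs(96, 64, 6, 16, 102, 0)); // 64: honoured
  printf("%u\n", clampRequestedSGPRs(96, 4, 6, 16, 102, 0));  // 96: request too small
}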
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumVGPRs = Requested; - } - - return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST); -} - ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { if (EltSize == 4) { @@ -1476,3 +1519,62 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const { return hasVGPRs(getRegClassForReg(MRI, Reg)); } + +bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const { + unsigned SrcSize = getRegSizeInBits(*SrcRC); + unsigned DstSize = getRegSizeInBits(*DstRC); + unsigned NewSize = getRegSizeInBits(*NewRC); + + // Do not increase size of registers beyond dword, we would need to allocate + // adjacent registers and constraint regalloc more than needed. + + // Always allow dword coalescing. + if (SrcSize <= 32 || DstSize <= 32) + return true; + + return NewSize <= DstSize || NewSize <= SrcSize; +} + +unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), + *MF.getFunction()); + switch (RC->getID()) { + default: + return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); + case AMDGPU::VGPR_32RegClassID: + return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); + case AMDGPU::SGPR_32RegClassID: + return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); + } +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + if (Idx == getVGPRPressureSet()) + return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, + const_cast<MachineFunction &>(MF)); + + if (Idx == getSGPRPressureSet()) + return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, + const_cast<MachineFunction &>(MF)); + + return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); +} + +const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { + static const int Empty[] = { -1 }; + + if (hasRegUnit(AMDGPU::M0, RegUnit)) + return Empty; + return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0bcae7d..600cc88 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -16,13 +16,14 @@ #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { -class SISubtarget; class MachineRegisterInfo; +class SISubtarget; class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { @@ -31,13 +32,22 @@ private: unsigned VGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; + bool SpillSGPRToVGPR; + bool SpillSGPRToSMEM; void reserveRegisterTuples(BitVector &, unsigned Reg) const; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; - public: - SIRegisterInfo(); + SIRegisterInfo(const 
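Editor's note: the new shouldCoalesce override always allows coalescing when either side is at most a dword, and otherwise refuses to create a register wider than both inputs, to avoid over-constraining the allocator with large adjacent-register tuples. Restated as a standalone predicate over sizes in bits:

#include <cstdio>

bool shouldCoalesce(unsigned SrcSizeBits, unsigned DstSizeBits,
                    unsigned NewSizeBits) {
  // Always allow dword coalescing.
  if (SrcSizeBits <= 32 || DstSizeBits <= 32)
    return true;
  // Otherwise the result must be no wider than at least one of the inputs.
  return NewSizeBits <= DstSizeBits || NewSizeBits <= SrcSizeBits;
}

int main() {
  printf("%d\n", shouldCoalesce(32, 64, 64));   // 1: dword input
  printf("%d\n", shouldCoalesce(64, 64, 128));  // 0: would grow both sides
  printf("%d\n", shouldCoalesce(64, 128, 128)); // 1: no wider than the dst
}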
SISubtarget &ST); + + bool spillSGPRToVGPR() const { + return SpillSGPRToVGPR; + } + + bool spillSGPRToSMEM() const { + return SpillSGPRToSMEM; + } /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. @@ -48,8 +58,22 @@ public: unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; + unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + + // Stack access is very expensive. CSRs are also the high registers, and we + // want to minimize the number of used registers. + unsigned getCSRFirstUseCost() const override { + return 100; + } + + unsigned getFrameRegister(const MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -78,16 +102,24 @@ public: const TargetRegisterClass *getPointerRegClass( const MachineFunction &MF, unsigned Kind = 0) const override; - void spillSGPR(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + /// If \p OnlyToVGPR is true, this will only succeed if this + bool spillSGPR(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS, + bool OnlyToVGPR = false) const; - void restoreSGPR(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + bool restoreSGPR(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS, + bool OnlyToVGPR = false) const; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; + bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS) const; + + StringRef getRegAsmName(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } @@ -165,12 +197,13 @@ public: WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. @@ -195,74 +228,28 @@ public: return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID); } - /// \returns SGPR allocation granularity supported by the subtarget. - unsigned getSGPRAllocGranule() const { - return 8; - } - - /// \returns Total number of SGPRs supported by the subtarget. - unsigned getTotalNumSGPRs(const SISubtarget &ST) const; - - /// \returns Number of addressable SGPRs supported by the subtarget. - unsigned getNumAddressableSGPRs(const SISubtarget &ST) const; - - /// \returns Number of reserved SGPRs supported by the subtarget. - unsigned getNumReservedSGPRs(const SISubtarget &ST, - const SIMachineFunctionInfo &MFI) const; - - /// \returns Minimum number of SGPRs that meets given number of waves per - /// execution unit requirement for given subtarget. - unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const; - - /// \returns Maximum number of SGPRs that meets given number of waves per - /// execution unit requirement for given subtarget. 
- unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU, - bool Addressable) const; - - /// \returns Maximum number of SGPRs that meets number of waves per execution - /// unit requirement for function \p MF, or number of SGPRs explicitly - /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. - /// - /// \returns Value that meets number of waves per execution unit requirement - /// if explicitly requested value cannot be converted to integer, violates - /// subtarget's specifications, or does not meet number of waves per execution - /// unit requirement. - unsigned getMaxNumSGPRs(const MachineFunction &MF) const; - - /// \returns VGPR allocation granularity supported by the subtarget. - unsigned getVGPRAllocGranule() const { - return 4; - } - - /// \returns Total number of VGPRs supported by the subtarget. - unsigned getTotalNumVGPRs() const { - return 256; - } + ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, + unsigned EltSize) const; - /// \returns Number of reserved VGPRs for debugger use supported by the - /// subtarget. - unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const; + bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const override; - /// \returns Minimum number of SGPRs that meets given number of waves per - /// execution unit requirement. - unsigned getMinNumVGPRs(unsigned WavesPerEU) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; - /// \returns Maximum number of VGPRs that meets given number of waves per - /// execution unit requirement. - unsigned getMaxNumVGPRs(unsigned WavesPerEU) const; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; - /// \returns Maximum number of VGPRs that meets number of waves per execution - /// unit requirement for function \p MF, or number of VGPRs explicitly - /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. - /// - /// \returns Value that meets number of waves per execution unit requirement - /// if explicitly requested value cannot be converted to integer, violates - /// subtarget's specifications, or does not meet number of waves per execution - /// unit requirement. - unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + const int *getRegUnitPressureSets(unsigned RegUnit) const override; - ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, - unsigned EltSize) const; + unsigned getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. + return AMDGPU::SGPR30_SGPR31; + } private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 31e714b..d097b78 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>, def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; +// Pseudo-registers: Used as placeholders during isel and immediately +// replaced, never seeing the verifier. 
+def PRIVATE_RSRC_REG : SIReg<"", 0>; +def FP_REG : SIReg<"", 0>; +def SP_REG : SIReg<"", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; + // VCC for 64-bit instructions def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias<VCC_LO> { @@ -44,6 +51,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; +def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; +def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; +def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; + // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -128,7 +140,7 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 103))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. @@ -179,7 +191,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, (add (decimate (shl SGPR_32, 15), 4))]>; // Trap handler TMP 32-bit registers -def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 11))> { let isAllocatable = 0; } @@ -197,7 +209,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +// i16/f16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -258,19 +271,20 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
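Editor's note: the register class changes in this file add v2i16/v2f16 as legal types for the 32-bit SGPR/VGPR classes, i.e. one 32-bit register now holds a pair of 16-bit elements. A sketch of that packing, assuming the conventional layout with element 0 in the low 16 bits (an assumption; the hunk itself does not spell out the lane order):

#include <cstdint>
#include <cstdio>

uint32_t packV2B16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}

int main() {
  // <1.0, 2.0> as IEEE half-precision bit patterns.
  uint32_t Packed = packV2B16(0x3C00, 0x4000);
  printf("0x%08x\n", Packed); // 0x40003c00
}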
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { let AllocationPriority = 7; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; } @@ -307,7 +321,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, + (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -319,7 +334,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { let AllocationPriority = 11; } -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { +def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; let AllocationPriority = 12; @@ -366,7 +381,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } @@ -417,6 +432,18 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> { let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher<MatchName#"F64">; } + + def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_V2INT16"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">; + let DecoderMethod = "decodeOperand_VSrcV216"; + } + + def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_V2FP16"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">; + let DecoderMethod = "decodeOperand_VSrcV216"; + } } } @@ -445,7 +472,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_128 : RegisterOperand<VReg_128>; +def VSrc_128 : RegisterOperand<VReg_128> { + let DecoderMethod = "DecodeVS_128RegisterClass"; +} //===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td index be27966..0f02f58 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel { let MicroOpBufferSize = 1; let IssueWidth = 1; let PostRAScheduler = 1; + + // FIXME:Approximate 2 * branch cost. Try to hack around bad + // early-ifcvt heuristics. These need improvement to avoid the OOE + // heuristics. 
+ int MispredictPenalty = 20; } def SIFullSpeedModel : SISchedMachineModel; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index dd31dc6..874fbad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. return true; @@ -108,10 +110,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - const MachineOperand *Src1Mod = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + if (Src1 && (!isVGPR(Src1, TRI, MRI) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) return false; // We don't need to check src0, all input types are legal, so just make sure @@ -120,58 +120,64 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, return false; // Check output modifiers - if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) - return false; - - return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); + return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); } /// \brief This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it -/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction -/// and will only fold literal constants if we are still in SSA. -static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, +/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. +static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineRegisterInfo &MRI, bool TryToCommute = true) { - - if (!MRI.isSSA()) - return; - assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - // Only one literal constant is allowed per instruction, so if src0 is a - // literal constant then we can't do any folding. - if (TII->isLiteralConstant(MI, Src0Idx)) - return; - // Try to fold Src0 MachineOperand &Src0 = MI.getOperand(Src0Idx); - if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { + if (Src0.isReg()) { unsigned Reg = Src0.getReg(); - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (Def && Def->isMoveImmediate()) { - MachineOperand &MovSrc = Def->getOperand(1); - bool ConstantFolded = false; - - if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || - isUInt<32>(MovSrc.getImm()))) { - Src0.ChangeToImmediate(MovSrc.getImm()); - ConstantFolded = true; - } - if (ConstantFolded) { - if (MRI.use_empty(Reg)) + if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || + isUInt<32>(MovSrc.getImm()))) { + // It's possible to have only one component of a super-reg defined by + // a single mov, so we need to clear any subregister flag. 
+ Src0.setSubReg(0); + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } else if (MovSrc.isFI()) { + Src0.setSubReg(0); + Src0.ChangeToFrameIndex(MovSrc.getIndex()); + ConstantFolded = true; + } + + if (ConstantFolded) { + assert(MRI.use_empty(Reg)); Def->eraseFromParent(); - ++NumLiteralConstantsFolded; - return; + ++NumLiteralConstantsFolded; + return true; + } } } } // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI)) - foldImmediates(MI, TII, MRI, false); + if (TryToCommute && MI.isCommutable()) { + if (TII->commuteInstruction(MI)) { + if (foldImmediates(MI, TII, MRI, false)) + return true; + // Commute back. + TII->commuteInstruction(MI); + } + } + + return false; } // Copy MachineOperand with all flags except setting it as implicit. @@ -497,24 +503,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); if (Op32DstIdx != -1) { // dst - Inst32.addOperand(MI.getOperand(0)); + Inst32.add(MI.getOperand(0)); } else { assert(MI.getOperand(0).getReg() == AMDGPU::VCC && "Unexpected case"); } - Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (Src1) - Inst32.addOperand(*Src1); + Inst32.add(*Src1); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { - Inst32.addOperand(*Src2); + Inst32.add(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is // replaced with an implicit read of vcc. This was already added diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp deleted file mode 100644 index aad6853..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass removes performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type. 
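Editor's note: the rewritten foldImmediates above folds a literal (or frame index) from a single-use V_MOV-style def into src0 and, failing that, commutes the instruction, retries, and commutes back if the retry also fails so the instruction is left unchanged. A toy model of that retry structure with plain structs, not MachineInstr:

#include <cstdio>
#include <optional>
#include <utility>

// Toy two-source instruction: each source is either a literal or a vreg that
// may be defined by a move-immediate we are allowed to fold.
struct Src { bool IsImm; int ImmOrReg; std::optional<int> DefImm; };
struct Inst { Src Src0, Src1; bool Commutable; };

static bool tryFoldSrc0(Inst &MI) {
  Src &S = MI.Src0;
  if (S.IsImm || !S.DefImm)
    return false;
  S.IsImm = true;
  S.ImmOrReg = *S.DefImm; // fold the literal from the defining move
  S.DefImm.reset();       // models erasing the now-dead def
  return true;
}

bool foldImmediates(Inst &MI, bool TryToCommute = true) {
  if (tryFoldSrc0(MI))
    return true;
  if (TryToCommute && MI.Commutable) {
    std::swap(MI.Src0, MI.Src1);   // commuteInstruction
    if (foldImmediates(MI, false))
      return true;
    std::swap(MI.Src0, MI.Src1);   // commute back on failure
  }
  return false;
}

int main() {
  Inst MI{{false, 5, std::nullopt}, {false, 6, 42}, true};
  printf("%d imm=%d\n", foldImmediates(MI), MI.Src0.ImmOrReg); // 1 imm=42
}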
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor<SITypeRewriter> { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return "SI Type Rewriter"; } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - if (!AMDGPU::isShader(F.getCallingConv())) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector <Value*, 8> Args; - SmallVector <Type*, 8> Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - if (!F) - return; - - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast<InsertElementInst>(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - } - - if (BitCastInst *Op = 
dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td index 0265648..73dd8b7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -226,9 +226,10 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime> def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast<LoadSDNode>(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || - (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && + !Ld->isVolatile() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; @@ -293,12 +294,6 @@ def : Pat < let Predicates = [isVI] in { -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0) ->; - def : Pat < (i64 (readcyclecounter)), (S_MEMREALTIME) diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td index 73cd577..ec29a66 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -82,6 +82,12 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo < let has_sdst = 0; } +class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo < + opName, (outs), (ins SReg_32:$src0), + "$src0", pattern> { + let has_sdst = 0; +} + class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0), "$sdst, $src0", pattern @@ -178,13 +184,27 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", + [(set i64:$sdst, (int_amdgcn_s_getpc))] +>; -let isTerminator = 1, isBarrier = 1, - isBranch = 1, isIndirectBranch = 1 in { +let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { + +let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; +} // End isBranch = 1, isIndirectBranch = 1 + +let isReturn = 1 in { +// Define variant marked as return rather than branch. 
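Editor's note: the smrd_load PatFrag change in SMInstructions.td adds a !isVolatile() requirement to the scalarized-global case. Restated as a standalone predicate (field names illustrative):

#include <cstdio>

// A load may be selected as a scalar (SMRD/SMEM) load if it is dword aligned
// and either is a uniform constant-address-space load, or -- when the
// subtarget scalarizes global loads -- a uniform, non-volatile global load
// whose memory is known not to be clobbered.
struct LoadInfo {
  unsigned Align;
  bool ConstantAS, GlobalAS, Uniform, Volatile, NoClobber;
};

bool isSMRDLoad(const LoadInfo &L, bool ScalarizeGlobal) {
  if (L.Align < 4)
    return false;
  if (L.ConstantAS && L.Uniform)
    return true;
  return ScalarizeGlobal && L.GlobalAS && !L.Volatile && L.Uniform &&
         L.NoClobber;
}

int main() {
  LoadInfo ConstLd{4, true, false, true, false, false};
  LoadInfo VolatileGlobal{4, false, true, true, true, true};
  printf("%d %d\n", isSMRDLoad(ConstLd, true),         // 1
                    isSMRDLoad(VolatileGlobal, true)); // 0: volatile
}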
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +} +} // End isTerminator = 1, isBarrier = 1 + +let isCall = 1 in { +def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64" +>; } -def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; + def S_RFE_B64 : SOP1_1 <"s_rfe_b64">; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { @@ -210,7 +230,7 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; } // End Uses = [M0] -def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">; +def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">; let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32">; @@ -428,7 +448,7 @@ def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">; def S_CBRANCH_G_FORK : SOP2_Pseudo < "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), + (ins SCSrc_b64:$src0, SCSrc_b64:$src1), "$src0, $src1" > { let has_sdst = 0; @@ -438,6 +458,22 @@ let Defs = [SCC] in { def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; } // End Defs = [SCC] +let SubtargetPredicate = isVI in { + def S_RFE_RESTORE_B64 : SOP2_Pseudo < + "s_rfe_restore_b64", (outs), + (ins SSrc_b64:$src0, SSrc_b32:$src1), + "$src0, $src1" + > { + let hasSideEffects = 1; + let has_sdst = 0; + } +} + +let SubtargetPredicate = isGFX9 in { + def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; + def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; + def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; +} //===----------------------------------------------------------------------===// // SOPK Instructions @@ -508,14 +544,16 @@ class SOPKInstTable <bit is_sopk, string cmpOp = ""> { class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins u16imm:$simm16), + (ins s16imm:$simm16), "$sdst, $simm16", pattern>; -class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < +class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo < opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), + !if(isSignExt, + (ins SReg_32:$sdst, s16imm:$simm16), + (ins SReg_32:$sdst, u16imm:$simm16)), "$sdst, $simm16", []>, SOPKInstTable<1, base_op>{ let Defs = [SCC]; @@ -524,7 +562,7 @@ class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins SReg_32:$src0, u16imm:$simm16), + (ins SReg_32:$src0, s16imm:$simm16), "$sdst, $simm16", pattern >; @@ -553,20 +591,20 @@ let isCompare = 1 in { // [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] // >; -def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; -def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">; -def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">; -def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">; -def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">; -def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">; +def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>; +def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>; +def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>; +def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>; +def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>; +def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>; let SOPKZext = 1 in { -def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">; -def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">; -def 
S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">; -def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">; -def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">; -def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">; +def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>; +def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>; +def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>; +def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>; +def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>; +def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>; } // End SOPKZext = 1 } // End isCompare = 1 @@ -578,7 +616,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, u16imm:$simm16), + (outs), (ins SReg_64:$sdst, s16imm:$simm16), "$sdst, $simm16" >; @@ -751,6 +789,14 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", let isReturn = 1; } +let SubtargetPredicate = isVI in { +def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { + let simm16 = 0; + let isBarrier = 1; + let isReturn = 1; +} +} + let isBranch = 1, SchedRW = [WriteBranch] in { def S_BRANCH : SOPP < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", @@ -792,6 +838,25 @@ def S_CBRANCH_EXECNZ : SOPP < >; } // End Uses = [EXEC] +def S_CBRANCH_CDBGSYS : SOPP < + 0x00000017, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys $simm16" +>; + +def S_CBRANCH_CDBGSYS_AND_USER : SOPP < + 0x0000001A, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_and_user $simm16" +>; + +def S_CBRANCH_CDBGSYS_OR_USER : SOPP < + 0x00000019, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_or_user $simm16" +>; + +def S_CBRANCH_CDBGUSER : SOPP < + 0x00000018, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbguser $simm16" +>; } // End isBranch = 1 } // End isTerminator = 1 @@ -806,9 +871,18 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", let isConvergent = 1; } +let SubtargetPredicate = isVI in { +def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { + let simm16 = 0; + let mayLoad = 1; + let mayStore = 1; +} +} + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. 
On VI the @@ -1207,6 +1281,10 @@ def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>; def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>; def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>; def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>; +def S_PACK_LL_B32_B16_vi : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>; +def S_PACK_LH_B32_B16_vi : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>; +def S_PACK_HH_B32_B16_vi : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>; +def S_RFE_RESTORE_B64_vi : SOP2_Real_vi <0x2b, S_RFE_RESTORE_B64>; def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>; def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>; diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 9908fc0..92fb762 100644 --- a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,7 +16,7 @@ using namespace llvm; -/// \brief The target which suports all AMD GPUs. This will eventually +/// \brief The target which supports all AMD GPUs. This will eventually /// be deprecated and there will be a R600 target and a GCN target. Target &llvm::getTheAMDGPUTarget() { static Target TheAMDGPUTarget; diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index b6868de..03b11ae 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -65,5 +65,18 @@ const char* const IdSymbolic[] = { }; } // namespace Hwreg + +namespace Swizzle { + +// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + "QUAD_PERM", + "BITMASK_PERM", + "SWAP", + "REVERSE", + "BROADCAST", +}; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index b2dc2c0..ebb2be2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax. extern const char* const IdSymbolic[]; } // namespace Hwreg + +namespace Swizzle { // Symbolic names for the swizzle(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5f651d4..67ad904 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===// +//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===// // // The LLVM Compiler Infrastructure // @@ -6,32 +6,41 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// + #include "AMDGPUBaseInfo.h" #include "AMDGPU.h" #include "SIDefines.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <utility> -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" -#undef GET_SUBTARGETINFO_ENUM - -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" -#undef GET_REGINFO_ENUM +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #define GET_INSTRINFO_NAMED_OPS -#define GET_INSTRINFO_ENUM #include "AMDGPUGenInstrInfo.inc" #undef GET_INSTRINFO_NAMED_OPS -#undef GET_INSTRINFO_ENUM namespace { @@ -56,11 +65,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { return (Src & getBitMask(Shift, Width)) >> Shift; } -/// \returns Vmcnt bit shift. -unsigned getVmcntBitShift() { return 0; } +/// \returns Vmcnt bit shift (lower bits). +unsigned getVmcntBitShiftLo() { return 0; } -/// \returns Vmcnt bit width. -unsigned getVmcntBitWidth() { return 4; } +/// \returns Vmcnt bit width (lower bits). +unsigned getVmcntBitWidthLo() { return 4; } /// \returns Expcnt bit shift. unsigned getExpcntBitShift() { return 4; } @@ -74,52 +83,241 @@ unsigned getLgkmcntBitShift() { return 8; } /// \returns Lgkmcnt bit width. unsigned getLgkmcntBitWidth() { return 4; } -} // anonymous namespace +/// \returns Vmcnt bit shift (higher bits). +unsigned getVmcntBitShiftHi() { return 14; } + +/// \returns Vmcnt bit width (higher bits). +unsigned getVmcntBitWidthHi() { return 2; } + +} // end namespace anonymous namespace llvm { + +static cl::opt<bool> EnablePackedInlinableLiterals( + "enable-packed-inlinable-literals", + cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), + cl::init(false)); + namespace AMDGPU { -IsaVersion getIsaVersion(const FeatureBitset &Features) { +namespace IsaInfo { +IsaVersion getIsaVersion(const FeatureBitset &Features) { + // SI. + if (Features.test(FeatureISAVersion6_0_0)) + return {6, 0, 0}; + if (Features.test(FeatureISAVersion6_0_1)) + return {6, 0, 1}; + // CI. if (Features.test(FeatureISAVersion7_0_0)) return {7, 0, 0}; - if (Features.test(FeatureISAVersion7_0_1)) return {7, 0, 1}; - if (Features.test(FeatureISAVersion7_0_2)) return {7, 0, 2}; + if (Features.test(FeatureISAVersion7_0_3)) + return {7, 0, 3}; + // VI. if (Features.test(FeatureISAVersion8_0_0)) return {8, 0, 0}; - if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; - if (Features.test(FeatureISAVersion8_0_2)) return {8, 0, 2}; - if (Features.test(FeatureISAVersion8_0_3)) return {8, 0, 3}; - if (Features.test(FeatureISAVersion8_0_4)) return {8, 0, 4}; - if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; - return {0, 0, 0}; + // GFX9. 
+ if (Features.test(FeatureISAVersion9_0_0)) + return {9, 0, 0}; + if (Features.test(FeatureISAVersion9_0_1)) + return {9, 0, 1}; + if (Features.test(FeatureISAVersion9_0_2)) + return {9, 0, 2}; + if (Features.test(FeatureISAVersion9_0_3)) + return {9, 0, 3}; + + if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) + return {0, 0, 0}; + return {7, 0, 0}; +} + +unsigned getWavefrontSize(const FeatureBitset &Features) { + if (Features.test(FeatureWavefrontSize16)) + return 16; + if (Features.test(FeatureWavefrontSize32)) + return 32; + + return 64; +} + +unsigned getLocalMemorySize(const FeatureBitset &Features) { + if (Features.test(FeatureLocalMemorySize32768)) + return 32768; + if (Features.test(FeatureLocalMemorySize65536)) + return 65536; + + return 0; +} + +unsigned getEUsPerCU(const FeatureBitset &Features) { + return 4; +} + +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + if (!Features.test(FeatureGCN)) + return 8; + unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize); + if (N == 1) + return 40; + N = 40 / N; + return std::min(N, 16u); +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features) { + return getMaxWavesPerEU(Features) * getEUsPerCU(Features); +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return getWavesPerWorkGroup(Features, FlatWorkGroupSize); +} + +unsigned getMinWavesPerEU(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxWavesPerEU(const FeatureBitset &Features) { + if (!Features.test(FeatureGCN)) + return 8; + // FIXME: Need to take scratch memory into account. + return 10; } +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize), + getEUsPerCU(Features)) / getEUsPerCU(Features); +} + +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) { + return 2048; +} + +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) / + getWavefrontSize(Features); +} + +unsigned getSGPRAllocGranule(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 16; + return 8; +} + +unsigned getSGPREncodingGranule(const FeatureBitset &Features) { + return 8; +} + +unsigned getTotalNumSGPRs(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 800; + return 512; +} + +unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { + if (Features.test(FeatureSGPRInitBug)) + return FIXED_NUM_SGPRS_FOR_INIT_BUG; + + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 102; + return 104; +} + +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + if (WavesPerEU >= getMaxWavesPerEU(Features)) + return 0; + unsigned MinNumSGPRs = + alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1), + getSGPRAllocGranule(Features)) + 1; + return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features)); +} + +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable) { + assert(WavesPerEU != 0); + + IsaVersion Version = getIsaVersion(Features); + unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU, + 
getSGPRAllocGranule(Features)); + unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features); + if (Version.Major >= 8 && !Addressable) + AddressableNumSGPRs = 112; + return std::min(MaxNumSGPRs, AddressableNumSGPRs); +} + +unsigned getVGPRAllocGranule(const FeatureBitset &Features) { + return 4; +} + +unsigned getVGPREncodingGranule(const FeatureBitset &Features) { + return getVGPRAllocGranule(Features); +} + +unsigned getTotalNumVGPRs(const FeatureBitset &Features) { + return 256; +} + +unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { + return getTotalNumVGPRs(Features); +} + +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + if (WavesPerEU >= getMaxWavesPerEU(Features)) + return 0; + unsigned MinNumVGPRs = + alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1), + getVGPRAllocGranule(Features)) + 1; + return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features)); +} + +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU, + getVGPRAllocGranule(Features)); + unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features); + return std::min(MaxNumVGPRs, AddressableNumVGPRs); +} + +} // end namespace IsaInfo + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features) { - - IsaVersion ISA = getIsaVersion(Features); + IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features); memset(&Header, 0, sizeof(Header)); Header.amd_kernel_code_version_major = 1; - Header.amd_kernel_code_version_minor = 0; + Header.amd_kernel_code_version_minor = 1; Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU Header.amd_machine_version_major = ISA.Major; Header.amd_machine_version_minor = ISA.Minor; @@ -127,6 +325,11 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.kernel_code_entry_byte_offset = sizeof(Header); // wavefront_size is specified as a power of 2: 2^6 = 64 threads. Header.wavefront_size = 6; + + // If the code object does not support indirect functions, then the value must + // be 0xffffffff. + Header.call_convention = -1; + // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. 
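Note on the new getMinNumSGPRs/getMaxNumSGPRs helpers above: they turn a waves-per-EU target into an SGPR budget by dividing the per-EU register pool and rounding to the allocation granule. A self-contained sketch of that arithmetic using the VI-class constants from the code (800 SGPRs total, granule 16, 102 addressable); the unaddressable-SGPR special case the real function takes when Addressable is false is left out:

#include <algorithm>
#include <cassert>
#include <cstdio>

// Round Value down to a multiple of Align (llvm::alignDown equivalent).
static unsigned alignDown(unsigned Value, unsigned Align) {
  return (Value / Align) * Align;
}

// Mirrors getMaxNumSGPRs above for a VI-class target.
static unsigned maxSGPRsForWaves(unsigned WavesPerEU) {
  assert(WavesPerEU != 0);
  const unsigned TotalSGPRs = 800, Granule = 16, Addressable = 102;
  unsigned Max = alignDown(TotalSGPRs / WavesPerEU, Granule);
  return std::min(Max, Addressable);
}

int main() {
  // 10 waves/EU leave 80 SGPRs per wave; at 5 waves/EU the 102 addressable
  // SGPRs become the binding limit.
  std::printf("%u %u\n", maxSGPRsForWaves(10), maxSGPRsForWaves(5));
  return 0;
}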
Header.kernarg_segment_alignment = 4; @@ -134,43 +337,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } -MCSection *getHSATextSection(MCContext &Ctx) { - return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE | - ELF::SHF_EXECINSTR | - ELF::SHF_AMDGPU_HSA_AGENT | - ELF::SHF_AMDGPU_HSA_CODE); +bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS; } -MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { - return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE | - ELF::SHF_AMDGPU_HSA_GLOBAL | - ELF::SHF_AMDGPU_HSA_AGENT); +bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS; } -MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { - return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE | - ELF::SHF_AMDGPU_HSA_GLOBAL); -} - -MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { - return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | - ELF::SHF_AMDGPU_HSA_AGENT); -} - -bool isGroupSegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -} - -bool isGlobalSegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -} - -bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -208,7 +384,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, return Default; } if (Strs.second.trim().getAsInteger(0, Ints.second)) { - if (!OnlyFirstRequired || Strs.second.trim().size()) { + if (!OnlyFirstRequired || !Strs.second.trim().empty()) { Ctx.emitError("can't parse second integer attribute " + Name); return Default; } @@ -217,57 +393,84 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, return Ints; } -unsigned getWaitcntBitMask(IsaVersion Version) { - unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); - unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); - return Vmcnt | Expcnt | Lgkmcnt; -} +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { + unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; + if (Version.Major < 9) + return VmcntLo; -unsigned getVmcntBitMask(IsaVersion Version) { - return (1 << getVmcntBitWidth()) - 1; + unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } -unsigned getExpcntBitMask(IsaVersion Version) { +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getExpcntBitWidth()) - 1; } -unsigned getLgkmcntBitMask(IsaVersion Version) { +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getLgkmcntBitWidth()) - 1; } -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { + unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); + 
unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; + if (Version.Major < 9) + return Waitcnt; + + unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi()); + return Waitcnt | VmcntHi; +} + +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { + unsigned VmcntLo = + unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return VmcntLo; + + unsigned VmcntHi = + unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); + VmcntHi <<= getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } -unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) { Vmcnt = decodeVmcnt(Version, Waitcnt); Expcnt = decodeExpcnt(Version, Waitcnt); Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } -unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) { - return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Vmcnt) { + Waitcnt = + packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return Waitcnt; + + Vmcnt >>= getVmcntBitWidthLo(); + return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); } -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) { +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt) { return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt) { return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); @@ -283,6 +486,7 @@ unsigned getInitialPSInputAddr(const Function &F) { bool isShader(CallingConv::ID cc) { switch(cc) { case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: @@ -296,6 +500,21 @@ bool isCompute(CallingConv::ID cc) { return !isShader(cc) || cc == CallingConv::AMDGPU_CS; } +bool isEntryFunctionCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return true; + default: + return false; + } +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } @@ 
-308,6 +527,24 @@ bool isVI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } +bool isGFX9(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { + for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { + if (*R == Reg1) return true; + } + return false; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { @@ -327,13 +564,34 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { return Reg; } +unsigned mc2PseudoReg(unsigned Reg) { + switch (Reg) { + case AMDGPU::FLAT_SCR_ci: + case AMDGPU::FLAT_SCR_vi: + return FLAT_SCR; + + case AMDGPU::FLAT_SCR_LO_ci: + case AMDGPU::FLAT_SCR_LO_vi: + return AMDGPU::FLAT_SCR_LO; + + case AMDGPU::FLAT_SCR_HI_ci: + case AMDGPU::FLAT_SCR_HI_vi: + return AMDGPU::FLAT_SCR_HI; + + default: + return Reg; + } +} + bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; return OpType >= AMDGPU::OPERAND_SRC_FIRST && OpType <= AMDGPU::OPERAND_SRC_LAST; } bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; switch (OpType) { case AMDGPU::OPERAND_REG_IMM_FP32: @@ -342,6 +600,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return true; default: return false; @@ -349,6 +608,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { } bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST && OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST; @@ -392,6 +652,7 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned RCID = Desc.OpInfo[OpNo].RegClass; return getRegBitWidth(MRI->getRegClass(RCID)) / 8; } @@ -440,7 +701,8 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { } bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); + if (!HasInv2Pi) + return false; if (Literal >= -16 && Literal <= 64) return true; @@ -457,5 +719,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -} // End namespace AMDGPU -} // End namespace llvm +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + if (!EnablePackedInlinableLiterals) + return false; + + int16_t Lo16 = static_cast<int16_t>(Literal); + int16_t Hi16 = static_cast<int16_t>(Literal >> 16); + return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); +} + +bool isUniformMMO(const MachineMemOperand *MMO) { + const Value *Ptr = MMO->getValue(); + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. 
+ // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { + if (isSI(ST) || isCI(ST)) + return ByteOffset >> 2; + + return ByteOffset; +} + +bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { + int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); + return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) : + isUInt<20>(EncodedOffset); +} +} // end namespace AMDGPU + +} // end namespace llvm + +const unsigned AMDGPUAS::MAX_COMMON_ADDRESS; +const unsigned AMDGPUAS::GLOBAL_ADDRESS; +const unsigned AMDGPUAS::LOCAL_ADDRESS; +const unsigned AMDGPUAS::PARAM_D_ADDRESS; +const unsigned AMDGPUAS::PARAM_I_ADDRESS; +const unsigned AMDGPUAS::CONSTANT_BUFFER_0; +const unsigned AMDGPUAS::CONSTANT_BUFFER_1; +const unsigned AMDGPUAS::CONSTANT_BUFFER_2; +const unsigned AMDGPUAS::CONSTANT_BUFFER_3; +const unsigned AMDGPUAS::CONSTANT_BUFFER_4; +const unsigned AMDGPUAS::CONSTANT_BUFFER_5; +const unsigned AMDGPUAS::CONSTANT_BUFFER_6; +const unsigned AMDGPUAS::CONSTANT_BUFFER_7; +const unsigned AMDGPUAS::CONSTANT_BUFFER_8; +const unsigned AMDGPUAS::CONSTANT_BUFFER_9; +const unsigned AMDGPUAS::CONSTANT_BUFFER_10; +const unsigned AMDGPUAS::CONSTANT_BUFFER_11; +const unsigned AMDGPUAS::CONSTANT_BUFFER_12; +const unsigned AMDGPUAS::CONSTANT_BUFFER_13; +const unsigned AMDGPUAS::CONSTANT_BUFFER_14; +const unsigned AMDGPUAS::CONSTANT_BUFFER_15; +const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE; + +namespace llvm { +namespace AMDGPU { + +AMDGPUAS getAMDGPUAS(Triple T) { + auto Env = T.getEnvironmentName(); + AMDGPUAS AS; + if (Env == "amdgiz" || Env == "amdgizcl") { + AS.FLAT_ADDRESS = 0; + AS.PRIVATE_ADDRESS = 5; + AS.REGION_ADDRESS = 4; + } + else { + AS.FLAT_ADDRESS = 4; + AS.PRIVATE_ADDRESS = 0; + AS.REGION_ADDRESS = 5; + } + return AS; +} + +AMDGPUAS getAMDGPUAS(const TargetMachine &M) { + return getAMDGPUAS(M.getTargetTriple()); +} + +AMDGPUAS getAMDGPUAS(const Module &M) { + return getAMDGPUAS(Triple(M.getTargetTriple())); +} +} // namespace AMDGPU +} // namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index ea5fc36..936e492 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1,4 +1,4 @@ -//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===// +//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,52 +10,149 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H +#include "AMDGPU.h" #include "AMDKernelCodeT.h" -#include "llvm/IR/CallingConv.h" - #include "SIDefines.h" - -#define GET_INSTRINFO_OPERAND_ENUM -#include "AMDGPUGenInstrInfo.inc" -#undef GET_INSTRINFO_OPERAND_ENUM +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstdint> +#include <utility> namespace llvm { class FeatureBitset; class Function; class GlobalValue; +class MachineMemOperand; class MCContext; -class MCInstrDesc; 
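Note on isInlinableLiteralV216 above: a packed 32-bit operand is accepted only when both halves carry the same value and that half is itself a legal 16-bit inline constant, and the whole feature is gated behind the new -enable-packed-inlinable-literals option. A standalone sketch of the check; only the 1/(2*pi) pattern is visible in this hunk, the other f16 values listed are the IEEE-754 half encodings of the usual inline constants (+-0.5, +-1.0, +-2.0, +-4.0):

#include <cstdint>

// 16-bit inline constants: the integer range -16..64 plus a short list of f16
// bit patterns the hardware encodes directly.  HasInv2Pi reflects whether the
// subtarget accepts the 1/(2*pi) constant.
static bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
  if (!HasInv2Pi)
    return false;
  if (Literal >= -16 && Literal <= 64)
    return true;
  uint16_t Val = static_cast<uint16_t>(Literal);
  return Val == 0x3C00 ||  //  1.0
         Val == 0xBC00 ||  // -1.0
         Val == 0x3800 ||  //  0.5
         Val == 0xB800 ||  // -0.5
         Val == 0x4000 ||  //  2.0
         Val == 0xC000 ||  // -2.0
         Val == 0x4400 ||  //  4.0
         Val == 0xC400 ||  // -4.0
         Val == 0x3118;    //  1/(2*pi)
}

// A packed v2i16/v2f16 operand is inlinable only when lo == hi and that half
// is an inline constant on its own.
static bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
  int16_t Lo16 = static_cast<int16_t>(Literal);
  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
  return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}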
class MCRegisterClass; class MCRegisterInfo; class MCSection; class MCSubtargetInfo; +class Triple; namespace AMDGPU { +namespace IsaInfo { -LLVM_READONLY -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +enum { + // The closed Vulkan driver sets 96, which limits the wave count to 8 but + // doesn't spill SGPRs as much as when 80 is set. + FIXED_NUM_SGPRS_FOR_INIT_BUG = 96 +}; +/// \brief Instruction set architecture version. struct IsaVersion { unsigned Major; unsigned Minor; unsigned Stepping; }; +/// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); -void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, - const FeatureBitset &Features); -MCSection *getHSATextSection(MCContext &Ctx); -MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); +/// \returns Wavefront size for given subtarget \p Features. +unsigned getWavefrontSize(const FeatureBitset &Features); + +/// \returns Local memory size in bytes for given subtarget \p Features. +unsigned getLocalMemorySize(const FeatureBitset &Features); + +/// \returns Number of execution units per compute unit for given subtarget \p +/// Features. +unsigned getEUsPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of work groups per compute unit for given subtarget +/// \p Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features without any kind of limitation. +unsigned getMaxWavesPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum number of waves per execution unit for given subtarget \p +/// Features. +unsigned getMinWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features without any kind of limitation. +unsigned getMaxWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum flat work group size for given subtarget \p Features. +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Maximum flat work group size for given subtarget \p Features. +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Number of waves per work group for given subtarget \p Features and +/// limited by given \p FlatWorkGroupSize. +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns SGPR allocation granularity for given subtarget \p Features. +unsigned getSGPRAllocGranule(const FeatureBitset &Features); + +/// \returns SGPR encoding granularity for given subtarget \p Features. +unsigned getSGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of SGPRs for given subtarget \p Features. +unsigned getTotalNumSGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of SGPRs for given subtarget \p Features. 
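To make the work-group helpers declared above concrete: getWavesPerWorkGroup rounds the flat work-group size up to whole wavefronts, and getMaxWorkGroupsPerCU then fits those groups into the 40-wave pool of a GCN compute unit, capped at 16 resident groups. A small sketch with the 64-lane wavefront default:

#include <algorithm>
#include <cstdio>

// Round Value up to a multiple of Align (llvm::alignTo equivalent).
static unsigned alignTo(unsigned Value, unsigned Align) {
  return ((Value + Align - 1) / Align) * Align;
}

// Mirrors getWavesPerWorkGroup for a 64-lane wavefront.
static unsigned wavesPerWorkGroup(unsigned FlatWorkGroupSize) {
  const unsigned WavefrontSize = 64;
  return alignTo(FlatWorkGroupSize, WavefrontSize) / WavefrontSize;
}

// Mirrors the GCN branch of getMaxWorkGroupsPerCU: 40 waves per CU, a
// single-wave group is special-cased, and at most 16 groups are resident.
static unsigned maxWorkGroupsPerCU(unsigned FlatWorkGroupSize) {
  unsigned N = wavesPerWorkGroup(FlatWorkGroupSize);
  if (N == 1)
    return 40;
  return std::min(40 / N, 16u);
}

int main() {
  // A 256-item work-group is 4 waves, so 10 of them fit on one CU.
  std::printf("%u %u\n", wavesPerWorkGroup(256), maxWorkGroupsPerCU(256));
  return 0;
}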
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features); + +/// \returns Minimum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +/// \returns Maximum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable); + +/// \returns VGPR allocation granularity for given subtarget \p Features. +unsigned getVGPRAllocGranule(const FeatureBitset &Features); + +/// \returns VGPR encoding granularity for given subtarget \p Features. +unsigned getVGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of VGPRs for given subtarget \p Features. +unsigned getTotalNumVGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of VGPRs for given subtarget \p Features. +unsigned getAddressableNumVGPRs(const FeatureBitset &Features); -MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); +/// \returns Minimum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); -MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); +/// \returns Maximum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); -bool isGroupSegment(const GlobalValue *GV); -bool isGlobalSegment(const GlobalValue *GV); -bool isReadOnlySegment(const GlobalValue *GV); +} // end namespace IsaInfo + +LLVM_READONLY +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); + +void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, + const FeatureBitset &Features); + +bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS); +bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS); +bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS); /// \returns True if constants should be emitted to .text section for given /// target triple \p TT, false otherwise. @@ -83,73 +180,108 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, std::pair<int, int> Default, bool OnlyFirstRequired = false); -/// \returns Waitcnt bit mask for given isa \p Version. -unsigned getWaitcntBitMask(IsaVersion Version); - /// \returns Vmcnt bit mask for given isa \p Version. -unsigned getVmcntBitMask(IsaVersion Version); +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Expcnt bit mask for given isa \p Version. -unsigned getExpcntBitMask(IsaVersion Version); +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Lgkmcnt bit mask for given isa \p Version. -unsigned getLgkmcntBitMask(IsaVersion Version); +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version); + +/// \returns Waitcnt bit mask for given isa \p Version. +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version. 
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and /// \p Lgkmcnt respectively. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) +/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] /// \p Lgkmcnt = \p Waitcnt[11:8] -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. -unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt); +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Vmcnt); /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version. -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt); +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt); /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version. -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt); +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt); /// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa /// \p Version. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: -/// Waitcnt[3:0] = \p Vmcnt -/// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) +/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) +/// Waitcnt[6:4] = \p Expcnt +/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); unsigned getInitialPSInputAddr(const Function &F); -bool isShader(CallingConv::ID cc); -bool isCompute(CallingConv::ID cc); +LLVM_READNONE +bool isShader(CallingConv::ID CC); + +LLVM_READNONE +bool isCompute(CallingConv::ID CC); + +LLVM_READNONE +bool isEntryFunctionCC(CallingConv::ID CC); + +// FIXME: Remove this when calling conventions cleaned up +LLVM_READNONE +inline bool isKernel(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + default: + return false; + } +} bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); +bool isGFX9(const MCSubtargetInfo &STI); + +/// \brief Is Reg - scalar register +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); + +/// \brief Is there any intersection between registers +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. 
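The reason the waitcnt helpers above now take an IsaVersion is the gfx9 field split documented in the comments: vmcnt grows from 4 to 6 bits, the low 4 bits stay in Waitcnt[3:0], and the extra two bits are parked in Waitcnt[15:14]. A self-contained sketch of that packing using the shifts and widths from AMDGPUBaseInfo.cpp; unlike the real encodeWaitcnt, counters not mentioned here simply stay zero instead of defaulting to their no-wait maxima:

#include <cassert>
#include <cstdio>

// Field layout: vmcnt lo = [3:0], expcnt = [6:4], lgkmcnt = [11:8],
// and on gfx9+ vmcnt[5:4] lives in [15:14].
static unsigned encodeWaitcnt(unsigned Major, unsigned Vmcnt, unsigned Expcnt,
                              unsigned Lgkmcnt) {
  unsigned W = (Vmcnt & 0xf) | ((Expcnt & 0x7) << 4) | ((Lgkmcnt & 0xf) << 8);
  if (Major >= 9)
    W |= ((Vmcnt >> 4) & 0x3) << 14;   // vmcnt[5:4] -> Waitcnt[15:14]
  return W;
}

static unsigned decodeVmcnt(unsigned Major, unsigned Waitcnt) {
  unsigned Vmcnt = Waitcnt & 0xf;
  if (Major >= 9)
    Vmcnt |= ((Waitcnt >> 14) & 0x3) << 4;
  return Vmcnt;
}

int main() {
  // A vmcnt of 35 only survives a round trip on gfx9+, where six bits exist.
  unsigned W9 = encodeWaitcnt(9, 35, 0, 0);
  unsigned W8 = encodeWaitcnt(8, 35, 0, 0);
  assert(decodeVmcnt(9, W9) == 35);
  std::printf("%u %u\n", decodeVmcnt(9, W9), decodeVmcnt(8, W8));
  return 0;
}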
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); +/// \brief Convert hardware register \p Reg to a pseudo register +LLVM_READNONE +unsigned mc2PseudoReg(unsigned Reg); + /// \brief Can this operand also contain immediate values? bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); @@ -188,6 +320,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return 2; default: @@ -210,7 +344,21 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); + +bool isUniformMMO(const MachineMemOperand *MMO); + +/// \returns The encoding that will be used for \p ByteOffset in the SMRD +/// offset field. +int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); + +/// \returns true if this offset is small enough to fit in the SMRD +/// offset field. \p ByteOffset should be the offset in bytes and +/// not the encoded offset. +bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); + } // end namespace AMDGPU } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index c55eaab..991408c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -87,7 +87,7 @@ COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR), -// TODO: enable_trap_handler +COMPPGM2(enable_trap_handler, compute_pgm_rsrc2_trap_handler, TRAP_HANDLER), COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN), COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN), COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN), diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 8cae83c..96b33c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -23,18 +23,27 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 { class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> vdst; - + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + +class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> { + bits<8> vdst; + let Inst{8-0} = 0xf9; // sdwa let Inst{16-9} = op; let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; // encoding } -class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : +class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : InstSI <P.Outs32, P.Ins32, "", pattern>, VOP <opName>, - SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>, + MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> { let 
isPseudo = 1; let isCodeGenOnly = 1; @@ -75,6 +84,8 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -83,10 +94,17 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : } class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { - list<dag> ret = !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]); + list<dag> ret = + !if(P.HasModifiers, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + !if(P.HasOMod, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))] + ) + ); } multiclass VOP1Inst <string opName, VOPProfile P, @@ -96,6 +114,23 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _sdwa : VOP1_SDWA_Pseudo <opName, P>; } +// Special profile for instructions which have clamp +// and output modifiers (but have no input modifiers) +class VOPProfileI2F<ValueType dstVt, ValueType srcVt> : + VOPProfile<[dstVt, srcVt, untyped, untyped]> { + + let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod); + let Asm64 = "$vdst, $src0$clamp$omod"; + + let HasModifiers = 0; + let HasClamp = 1; + let HasOMod = 1; +} + +def VOP1_F64_I32 : VOPProfileI2F <f64, i32>; +def VOP1_F32_I32 : VOPProfileI2F <f32, i32>; +def VOP1_F16_I16 : VOPProfileI2F <f16, i16>; + //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -142,24 +177,24 @@ def V_READFIRSTLANE_B32 : let SchedRW = [WriteQuarterRate32] in { defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; -defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>; -defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>; -defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>; +defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; +defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; -defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>; -defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>; +defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; -defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>; 
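The v_cvt_f32_ubyte0..3 conversions just above were switched to the new VOP1_F32_I32 profile (clamp and output modifier, no input modifiers). Behaviourally each one extracts a single byte of the 32-bit source and converts it to float, which is what the AMDGPUcvt_f32_ubyteN nodes model; a one-function sketch of that behaviour (not the ISA text):

#include <cstdint>
#include <cstdio>

// Sketch of v_cvt_f32_ubyteN: take byte N of the source and convert it to
// float, as used when unpacking 8-bit-per-channel data.
static float cvtF32UByte(uint32_t Src, unsigned N) {
  return static_cast<float>((Src >> (8 * N)) & 0xffu);
}

int main() {
  uint32_t Packed = 0x11223344u;
  // Byte 0 is 0x44 (68.0), byte 3 is 0x11 (17.0).
  std::printf("%.0f %.0f\n", cvtF32UByte(Packed, 0), cvtF32UByte(Packed, 3));
  return 0;
}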
-defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>; -defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>; -defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>; +defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>; +defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>; +defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>; +defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>; defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; -defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>; +defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; } // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; @@ -217,6 +252,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; + let HasSDWA9 = 0; } // Special case because there are no true output operands. Hack vdst @@ -232,16 +268,19 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; - let Asm64 = getAsm64<1, 1, 0>.ret; + let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; - let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; let HasExt = 0; + let HasSDWA9 = 0; let HasDst = 0; let EmitDst = 1; // force vdst emission } @@ -258,11 +297,14 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; } // End Uses = [M0, EXEC] +let SchedRW = [WriteQuarterRate32] in { +defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; +} + // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { -defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; @@ -295,10 +337,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { -defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; defm 
V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; @@ -318,7 +360,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { def : Pat< (f32 (f16_to_fp i16:$src)), @@ -326,12 +368,31 @@ def : Pat< >; def : Pat< - (i16 (fp_to_f16 f32:$src)), + (i16 (AMDGPUfp_to_f16 f32:$src)), (V_CVT_F16_F32_e32 $src) >; } +def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1); + let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1); + let Outs64 = Outs32; + let Asm32 = " $vdst, $src0"; + let Asm64 = ""; + let Ins64 = (ins); +} + +let SubtargetPredicate = isGFX9 in { + let Constraints = "$vdst = $src1, $vdst1 = $src0", + DisableEncoding="$vdst1,$src1", + SchedRW = [Write64Bit, Write64Bit] in { +// Never VOP3. Takes as long as 2 v_mov_b32s +def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; +} + +} // End SubtargetPredicate = isGFX9 + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -453,6 +514,14 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } +multiclass VOP1Only_Real_vi <bits<10> op> { + let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + def _vi : + VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; + } +} + multiclass VOP1_Real_vi <bits<10> op> { let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { def _e32_vi : @@ -467,6 +536,10 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; @@ -480,6 +553,7 @@ defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>; defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>; defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>; defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>; +defm V_MOV_FED_B32 : VOP1_Real_vi <0x9>; defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>; defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>; defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>; @@ -547,7 +621,7 @@ defm V_RNDNE_F16 : VOP1_Real_vi <0x47>; defm V_FRACT_F16 : VOP1_Real_vi <0x48>; defm V_SIN_F16 : VOP1_Real_vi <0x49>; defm V_COS_F16 : VOP1_Real_vi <0x4a>; - +defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>; // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. 
vdst can't be treated as a def for codegen purposes, diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 00e5ab3..d5acb49 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -40,12 +40,24 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> vdst; bits<8> src1; - + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding +} + +class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { + bits<8> vdst; + bits<9> src1; + let Inst{8-0} = 0xf9; // sdwa let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr } class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : @@ -93,6 +105,8 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -103,7 +117,10 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -119,11 +136,9 @@ multiclass VOP2Inst <string opName, def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; } -// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst multiclass VOP2bInst <string opName, VOPProfile P, SDPatternOperator node = null_frag, @@ -134,10 +149,12 @@ multiclass VOP2bInst <string opName, let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _sdwa : VOP2_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; } @@ -154,6 +171,7 @@ multiclass VOP2eInst <string opName, def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; } @@ -162,8 +180,11 @@ multiclass VOP2eInst <string opName, class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, 
VGPR_32:$src1, ImmOpType:$imm); - field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; + field string Asm32 = " $vdst, $src0, $src1, $imm"; } def VOP_MADAK_F16 : VOP_MADAK <f16>; @@ -172,45 +193,55 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>; class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); - field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; + field string Asm32 = " $vdst, $src0, $imm, $src1"; } def VOP_MADMK_F16 : VOP_MADMK <f16>; def VOP_MADMK_F32 : VOP_MADMK <f32>; +// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory +// and processing time but it makes it easier to convert to mad. class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; let HasExt = 1; + let HasSDWA9 = 0; } def VOP_MAC_F16 : VOP_MAC <f16> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret; } def VOP_MAC_F32 : VOP_MAC <f32> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret; } // Write out to vcc or arbitrary SGPR. 
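As a rough reading aid for the SDWA9 plumbing this VOP2Instructions.td hunk adds: the VOP2_SDWA9Ae class near the top of the file keeps the VOP2 shape in the low dword (the 0xf9 marker in the src0 slot, then src1{7-0}, vdst and the 6-bit opcode, with bit 31 left at 0) and widens src1 to nine bits so that bit 63 can flag an SGPR source. The sketch below packs just those fields; it is a plain C++ illustration under that reading, not LLVM's MC emitter, the helper name and the field values in main are invented, and the remaining high-dword fields (src0, selects, modifiers, omod) come from the VOP_SDWA9Ae base class in VOPInstructions.td later in this patch.

#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Packs only the fields that VOP2_SDWA9Ae itself assigns; everything else stays 0.
uint64_t packVOP2SDWA9Fields(unsigned Op, unsigned VDst, unsigned Src1) {
  assert(Op < 64 && VDst < 256 && Src1 < 512 && "field out of range");
  uint64_t Inst = 0;
  Inst |= UINT64_C(0xf9);                     // Inst{8-0}: SDWA marker in the src0 slot
  Inst |= uint64_t(Src1 & 0xff) << 9;         // Inst{16-9}: src1{7-0}
  Inst |= uint64_t(VDst & 0xff) << 17;        // Inst{24-17}: vdst
  Inst |= uint64_t(Op & 0x3f) << 25;          // Inst{30-25}: VOP2 opcode
                                              // Inst{31} stays 0: VOP2 encoding bit
  Inst |= uint64_t((Src1 >> 8) & 1) << 63;    // Inst{63}: src1_sgpr, the GFX9 addition
  return Inst;
}

int main() {
  // Made-up field values, only to show where each one lands.
  printf("0x%016" PRIx64 "\n", packVOP2SDWA9Fields(0x19, 2, 0x104));
  return 0;
}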
@@ -218,6 +249,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; let Asm64 = "$vdst, $sdst, $src0, $src1"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -235,6 +267,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -243,9 +276,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // implicit VCC use. let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, - Src1Mod:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, @@ -253,6 +287,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; + let HasSDWA9 = 1; } // Read in from vcc or arbitrary SGPR @@ -275,15 +310,19 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let Outs32 = (outs VGPR_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } //===----------------------------------------------------------------------===// @@ -293,7 +332,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let SubtargetPredicate = isGCN in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -323,7 +362,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2", defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
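The two VOP2b profiles above spell out the carry plumbing for the integer add/sub family: the plain form writes its carry-out to vcc (or to an explicit $sdst in the _e64 form), and the _I1 variant also consumes a carry-in, which is what lets 64-bit arithmetic be chained through two 32-bit VALU ops. Below is a scalar C++ model of that chaining, offered only as an illustration of the idea, not the selection logic; the helper names echo the mnemonics but are otherwise invented. The AMDGPUadde/AMDGPUsube patterns added in the next hunk pick V_ADDC_U32_e64 and V_SUBB_U32_e64 for exactly this role.

#include <cstdint>
#include <cstdio>

struct AddResult { uint32_t Lo; bool Carry; };

// v_add_u32-style add: produces a carry-out (vcc, or $sdst in the _e64 form).
AddResult v_add_u32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;
  return {Sum, Sum < A};
}

// v_addc_u32-style add: consumes a carry-in ($src2 / implicit vcc) and
// produces a carry-out as well.
AddResult v_addc_u32(uint32_t A, uint32_t B, bool CarryIn) {
  uint64_t Wide = uint64_t(A) + B + (CarryIn ? 1 : 0);
  return {uint32_t(Wide), Wide > UINT32_MAX};
}

// A 64-bit add split into two 32-bit halves linked by the carry.
uint64_t add64(uint64_t X, uint64_t Y) {
  AddResult Lo = v_add_u32(uint32_t(X), uint32_t(Y));
  AddResult Hi = v_addc_u32(uint32_t(X >> 32), uint32_t(Y >> 32), Lo.Carry);
  return (uint64_t(Hi.Lo) << 32) | Lo.Lo;
}

int main() {
  uint64_t X = 0x00000001ffffffffULL, Y = 0x0000000100000001ULL;
  printf("%d\n", add64(X, Y) == X + Y ? 1 : 0);  // prints 1
  return 0;
}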
@@ -346,20 +385,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; } // End isConvergent = 1 -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; -defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; } // End SubtargetPredicate = isGCN +def : Pat< + (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), + (V_ADDC_U32_e64 $src0, $src1, $src2) +>; + +def : Pat< + (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), + (V_SUBB_U32_e64 $src0, $src1, $src2) +>; // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -376,9 +424,9 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End let SubtargetPredicate = SICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; @@ -389,7 +437,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; 
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; @@ -407,7 +455,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } } // End isCommutable = 1 -} // End SubtargetPredicate = isVI +} // End SubtargetPredicate = Has16BitInsts // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { @@ -457,7 +505,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) >; -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>; defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>; @@ -494,7 +542,15 @@ def : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) >; -} // End Predicates = [isVI] +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. +def : Pat< + (add i16:$src0, (i16 NegSubInlineConst16:$src1)), + (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) +>; + +} // End Predicates = [Has16BitInsts] //===----------------------------------------------------------------------===// // SI @@ -566,7 +622,10 @@ defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; defm V_READLANE_B32 : VOP2_Real_si <0x01>; + +let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in { defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; +} defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; @@ -635,6 +694,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> { VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } +multiclass VOP2_Real_e64only_vi <bits<10> op> { + def _e64_vi : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Hack to stop printing _e64 + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + let OutOperandList = (outs VGPR_32:$vdst); + let AsmString = ps.Mnemonic # " " # ps.AsmOperands; + } +} + multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> { def _e64_vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, @@ -646,22 +716,28 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" - + multiclass VOP2_SDWA_Real <bits<6> op> { def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } +multiclass VOP2_SDWA9_Real <bits<6> op> { + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; +} + multiclass VOP2be_Real_e32e64_vi <bits<6> op> : - Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } multiclass VOP2_Real_e32e64_vi <bits<6> op> : - Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; @@ -702,17 +778,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>; defm V_READLANE_B32 : VOP32_Real_vi <0x289>; defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; -defm 
V_BFM_B32 : VOP2_Real_e64_vi <0x293>; -defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>; -defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>; +defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; +defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>; +defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>; defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>; defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>; @@ -740,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. -class SI2_VI3Alias <string name, Instruction inst> : InstAlias < +class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c2a4d4b..92ed070 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,41 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, + (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + 
(P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -72,6 +96,7 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> { class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. let HasModifiers = 0; + let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } @@ -86,6 +111,14 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { let DstRC = RegisterOperand<VReg_64>; } +def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; + + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; +} + //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// @@ -144,8 +177,8 @@ def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>; def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>; def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>; -def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbit>; +def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>; def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>; def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>; @@ -181,7 +214,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>; + +let Constraints = "@earlyclobber $vdst" in { def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>; +} // End Constraints = "@earlyclobber $vdst" def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> { let SchedRW = [WriteDouble]; @@ -204,25 +240,25 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; let SubtargetPredicate = isCIVI in { -def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>; +let Constraints = "@earlyclobber $vdst" in { def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>; def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>; +} // End Constraints = "@earlyclobber $vdst" let isCommutable 
= 1 in { -def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>; - -// XXX - Does this set VCC? -def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>; +def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; +def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End isCommutable = 1 } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { + +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; let isCommutable = 1 in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; @@ -233,13 +269,16 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; } // End isCommutable = 1 +} // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = isVI in { +def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; } // End SubtargetPredicate = isVI -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { -multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, - Instruction inst, SDPatternOperator op3> { +multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, + Instruction inst, SDPatternOperator op3> { def : Pat< (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst i16:$src0, i16:$src1, i16:$src2) @@ -258,10 +297,34 @@ def : Pat< >; } -defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>; -defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>; + +} // End Predicates = [Has16BitInsts] + +let SubtargetPredicate = isGFX9 in { +def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>; +def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -} // End Predicates = [isVI] +def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + +def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>; +def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>; +def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>; + +def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>; +def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>; +def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>; + +def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>; +def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>; +def V_MAX3_U16 : 
VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>; +} // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -351,11 +414,18 @@ multiclass VOP3_Real_ci<bits<9> op> { } } -defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; +multiclass VOP3be_Real_ci<bits<9> op> { + def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> { + let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; + } +} + defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; -defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>; -defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>; -defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>; +defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; +defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; +defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>; //===----------------------------------------------------------------------===// // VI @@ -375,9 +445,8 @@ multiclass VOP3be_Real_vi<bits<10> op> { } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" -defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; -defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>; -defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>; +defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; +defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>; defm V_MAD_F32 : VOP3_Real_vi <0x1c1>; @@ -424,6 +493,8 @@ defm V_MAD_F16 : VOP3_Real_vi <0x1ea>; defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; +defm V_PERM_B32 : VOP3_Real_vi <0x1ed>; + defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>; @@ -449,3 +520,25 @@ defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>; defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>; + +defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>; +defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>; +defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>; +defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>; +defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; +defm V_OR3_B32 : VOP3_Real_vi <0x202>; +defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; + +defm V_XAD_U32 : VOP3_Real_vi <0x1f3>; + +defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>; +defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>; +defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>; + +defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>; +defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>; +defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>; + +defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; +defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; +defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td new file mode 100644 index 0000000..3becf75 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -0,0 +1,102 @@ +//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP3P Classes +//===----------------------------------------------------------------------===// + +class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : + VOP3P_Pseudo<OpName, P, + !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret) +>; + +// Non-packed instructions that use the VOP3P encoding. +// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. +class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : + VOP3P_Pseudo<OpName, P> { + let InOperandList = + (ins + FP32InputMods:$src0_modifiers, VCSrc_f32:$src0, + FP32InputMods:$src1_modifiers, VCSrc_f32:$src1, + FP32InputMods:$src2_modifiers, VCSrc_f32:$src2, + clampmod:$clamp, + op_sel:$op_sel, + op_sel_hi:$op_sel_hi); + let AsmOperands = + " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +} + +let isCommutable = 1 in { +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; +def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; + +def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; +def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>; + +def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; +def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; + +def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; +def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>; +def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>; +def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; +} + +def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; + +def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; +def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; +def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; + +// XXX - Commutable? +// These are VOP3a-like opcodes which accept no omod. +// Size of src arguments (16/32) is controlled by op_sel. +// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. 
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>; +def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; +def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; + + +multiclass VOP3P_Real_vi<bits<10> op> { + def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> { + let AssemblerPredicates = [HasVOP3PInsts]; + let DecoderNamespace = "VI"; + } +} + +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; +defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; +defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; +defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; +defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; + +defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; +defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; +defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; +defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; +defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>; +defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>; +defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>; +defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>; + +defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>; +defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; +defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 16a456d..b636fc9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{44-43} = SDWA.UNUSED_PRESERVE; } +class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + + //===----------------------------------------------------------------------===// // VOPC classes //===----------------------------------------------------------------------===// @@ -93,6 +104,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -135,6 +148,19 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : let SubtargetPredicate = AssemblerPredicate; } +class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies { + list<dag> ret = !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); +} + + multiclass VOPC_Pseudos <string opName, VOPC_Profile P, PatLeaf cond = COND_NULL, @@ -150,14 +176,7 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - def _e64 : VOP3_Pseudo<opName, P, - !if(P.HasModifiers, - [(set i1:$sdst, 
- (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>, + def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -165,13 +184,11 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - def _sdwa : VOPC_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> { + def _sdwa : VOPC_SDWA_Pseudo <opName, P> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; let isConvergent = DefExec; let isCompare = 1; - let isCommutable = 1; } } @@ -517,9 +534,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; @@ -563,7 +582,7 @@ multiclass VOPC_CLASS_F16 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>; multiclass VOPCX_CLASS_F16 <string opName> : - VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>; multiclass VOPC_CLASS_F32 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>; @@ -621,7 +640,7 @@ class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE, DSTOMOD.NONE) + DSTCLAMP.NONE) >; def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; @@ -920,6 +939,10 @@ multiclass VOPC_Real_vi <bits<10> op> { VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), !cast<Instruction>(NAME#"_e32_vi")> { let AssemblerPredicate = isVI; diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5f72f97..b47538b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -51,12 +51,8 @@ class VOP3Common <dag outs, dag ins, string asm = "", let VOP3 = 1; - let AsmMatchConverter = - !if(!eq(VOP3Only,1), - "cvtVOP3", - !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); - let AsmVariantName = AMDGPUAsmVariants.VOP3; + let AsmMatchConverter = !if(!eq(HasMods,1), "cvtVOP3", ""); let isCodeGenOnly = 0; @@ -68,8 +64,9 @@ class VOP3Common <dag outs, dag ins, string asm = "", let hasPostISelHook = 1; } -class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> : - InstSI <P.Outs64, P.Ins64, "", pattern>, +class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], + bit VOP3Only = 0, bit isVOP3P = 0> : + InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>, VOP <opName>, SIMCInstr<opName#"_e64", 
SIEncodingFamily.NONE>, MnemonicAlias<opName#"_e64", opName> { @@ -79,7 +76,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On let UseNamedOperandTable = 1; string Mnemonic = opName; - string AsmOperands = P.Asm64; + string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64); let Size = 8; let mayLoad = 0; @@ -100,23 +97,32 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On let VOP3 = 1; let VALU = 1; + let FPClamp = P.HasFPClamp; let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.VOP3; let AsmMatchConverter = - !if(!eq(VOP3Only,1), - "cvtVOP3", - !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", "")); + !if(!and(P.IsPacked, isVOP3P), + "cvtVOP3P", + !if(!or(P.HasModifiers, P.HasOMod), + "cvtVOP3", + "")); VOPProfile Pfl = P; } +class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : + VOP3_Pseudo<opName, P, pattern, 1, 1> { + let VOP3P = 1; +} + class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let isPseudo = 0; let isCodeGenOnly = 0; + let UseNamedOperandTable = 1; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; @@ -128,8 +134,17 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; + + VOPProfile Pfl = ps.Pfl; } +// XXX - Is there any reason to distingusih this from regular VOP3 +// here? +class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> : + VOP3_Real<ps, EncodingFamily>; + class VOP3a<VOPProfile P> : Enc64 { bits<2> src0_modifiers; bits<9> src0; @@ -197,6 +212,42 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } +class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { + bits<8> vdst; + // neg, neg_hi, op_sel put in srcN_modifiers + bits<4> src0_modifiers; + bits<9> src0; + bits<4> src1_modifiers; + bits<9> src1; + bits<4> src2_modifiers; + bits<9> src2; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2) + + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = !if(P.HasSrc0, src0, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) +} + class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> { let Inst{25-17} = op; } @@ -238,11 +289,65 @@ class 
VOP_SDWAe<VOPProfile P> : Enc64 { let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); - let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); +} + +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + +// gfx9 SDWA basic encoding +class VOP_SDWA9e<VOPProfile P> : Enc64 { + bits<9> src0; // {src0_sgpr{0}, src0{7-0}} + bits<3> src0_sel; + bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<3> src1_sel; + bits<2> src1_modifiers; + bits<1> src1_sgpr; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); + let Inst{63} = 0; // src1_sgpr - should be specified in subclass +} + +// gfx9 SDWA-A +class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> { + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + bits<2> omod; + + let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD); + let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); + let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); + let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0); +} + +// gfx9 SDWA-B +class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { + bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}} + + let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0); + let Inst{47} = !if(P.EmitDst, sdst{7}, 0); } class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : @@ -250,25 +355,26 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOP <opName>, SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>, MnemonicAlias <opName#"_sdwa", opName> { - + let isPseudo = 1; let isCodeGenOnly = 1; let UseNamedOperandTable = 1; string Mnemonic = opName; string AsmOperands = P.AsmSDWA; + string AsmOperands9 = P.AsmSDWA9; let Size = 8; let mayLoad = 0; let mayStore = 0; - let hasSideEffects = 0; + let hasSideEffects = 0; let VALU = 1; let SDWA = 1; let Uses = [EXEC]; - - let SubtargetPredicate = isVI; - let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + + let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst); + let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst); let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA, 
AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA"; @@ -278,7 +384,7 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -303,6 +409,35 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } +class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + + // Copy relevant pseudo op flags + let AsmMatchConverter = ps.AsmMatchConverter; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + class VOP_DPPe<VOPProfile P> : Enc64 { bits<2> src0_modifiers; bits<8> src0; @@ -337,8 +472,8 @@ class VOP_DPP <string OpName, VOPProfile P> : let Size = 8; let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); - let SubtargetPredicate = isVI; - let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + let SubtargetPredicate = HasDPP; + let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst); let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let DecoderNamespace = "DPP"; @@ -348,3 +483,4 @@ include "VOPCInstructions.td" include "VOP1Instructions.td" include "VOP2Instructions.td" include "VOP3Instructions.td" +include "VOP3PInstructions.td"
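To close out the section, a sketch of how the new VOP3Pe word packs together, since the freshly included VOP3PInstructions.td leans on it. The bit positions are the ones listed in the VOP3Pe class above (vdst in 7-0, neg_hi in 8-10, op_sel in 11-13, op_sel_hi(2) in 14, clamp in 15, a 10-bit opcode in 25-16, the 0x34 VOP3P encoding in 31-26, three 9-bit sources in 40-32/49-41/58-50, op_sel_hi(0..1) in 59-60, and the low neg bits in 61-63), with each srcN_modifiers nibble read as bit 0 = neg, bit 1 = neg_hi, bit 2 = op_sel, bit 3 = op_sel_hi, per the "neg, neg_hi, op_sel put in srcN_modifiers" comment in that class. The C++ below is only an illustration of that packing, not the MC emitter; the opcode 0x38e matches the v_pk_fma_f16 entry above, while the register numbers and modifier values are invented.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

uint64_t packVOP3P(unsigned Op, unsigned VDst, const unsigned Src[3],
                   const unsigned Mods[3], bool Clamp) {
  uint64_t Inst = 0;
  Inst |= uint64_t(VDst & 0xff);                      // Inst{7-0}   vdst
  for (int I = 0; I < 3; ++I) {
    Inst |= uint64_t((Mods[I] >> 1) & 1) << (8 + I);  // Inst{8-10}  neg_hi src0..2
    Inst |= uint64_t((Mods[I] >> 2) & 1) << (11 + I); // Inst{11-13} op_sel(0..2)
  }
  Inst |= uint64_t((Mods[2] >> 3) & 1) << 14;         // Inst{14}    op_sel_hi(2)
  Inst |= uint64_t(Clamp ? 1 : 0) << 15;              // Inst{15}    clamp
  Inst |= uint64_t(Op & 0x3ff) << 16;                 // Inst{25-16} opcode
  Inst |= uint64_t(0x34) << 26;                       // Inst{31-26} VOP3P encoding
  Inst |= uint64_t(Src[0] & 0x1ff) << 32;             // Inst{40-32} src0
  Inst |= uint64_t(Src[1] & 0x1ff) << 41;             // Inst{49-41} src1
  Inst |= uint64_t(Src[2] & 0x1ff) << 50;             // Inst{58-50} src2
  Inst |= uint64_t((Mods[0] >> 3) & 1) << 59;         // Inst{59}    op_sel_hi(0)
  Inst |= uint64_t((Mods[1] >> 3) & 1) << 60;         // Inst{60}    op_sel_hi(1)
  for (int I = 0; I < 3; ++I)
    Inst |= uint64_t(Mods[I] & 1) << (61 + I);        // Inst{61-63} neg (lo) src0..2
  return Inst;
}

int main() {
  // Made-up source operands; modifiers set only op_sel_hi on each source.
  unsigned Src[3] = {0x100, 0x101, 0x102}, Mods[3] = {0x8, 0x8, 0x8};
  printf("0x%016" PRIx64 "\n", packVOP3P(0x38e, 0, Src, Mods, false));
  return 0;
}

Note how neg, neg_hi and op_sel all travel in the three srcN_modifiers operands, as the class comment says, so an encoder only has to pick bits out of three nibbles plus clamp.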